<!DOCTYPE HTML>
<html lang="zh-CN">


<head>
    <meta charset="utf-8">
    <meta name="keywords" content="scikit-learn系列二：K-近邻, 欢迎来到，TWOTO 的博客">
    <meta name="description" content="技术、效率、摄影">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <!-- Viewport: zooming is left enabled so users can pinch-zoom the page (WCAG 1.4.4). -->
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta name="renderer" content="webkit|ie-stand|ie-comp">
    <meta name="mobile-web-app-capable" content="yes">
    <meta name="format-detection" content="telephone=no">
    <meta name="apple-mobile-web-app-capable" content="yes">
    <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
    <!-- Global site tag (gtag.js) - Google Analytics -->


    <title>scikit-learn系列二：K-近邻 | 欢迎来到，TWOTO 的博客</title>
    <link rel="icon" type="image/png" href="/twoto.png">

    <link rel="stylesheet" type="text/css" href="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/libs/awesome/css/all.css">
    <link rel="stylesheet" type="text/css" href="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/libs/materialize/materialize.min.css">
    <link rel="stylesheet" type="text/css" href="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/libs/aos/aos.css">
    <link rel="stylesheet" type="text/css" href="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/libs/animate/animate.min.css">
    <link rel="stylesheet" type="text/css" href="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/libs/lightGallery/css/lightgallery.min.css">
    <link rel="stylesheet" type="text/css" href="/css/matery.css">
    <link rel="stylesheet" type="text/css" href="/css/my.css">

    <script src="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/libs/jquery/jquery.min.js"></script>

<style type="text/css" lang="css">
    /* Full-screen overlay that covers the page while it loads. */
    #loading-container{
        position: fixed;
        top: 0;
        left: 0;
        min-height: 100vh;
        width: 100vw;
        z-index: 9999;
        display: flex;
        flex-direction: column;
        justify-content: center;
        align-items: center;
        background: #FFF;
        text-align: center;
        /* The overlay is dismissed with an opacity fade (see .fadeout). */
        -webkit-transition: opacity 1s ease;
        -moz-transition: opacity 1s ease;
        -o-transition: opacity 1s ease;
        transition: opacity 1s ease;
    }
    /* Box holding the animation. Its transform also makes it the containing
       block for the absolutely positioned dots further down. */
    .loading-image{
        width: 120px;
        height: 50px;
        transform: translate(-50%);
    }

    /* Dots: all use the pacman-balls keyframes with staggered start delays.
       (A former nth-child(2) entry was removed: the later nth-child(2) rule
       sets both animation properties and therefore always overrode it.) */
    .loading-image div:nth-child(3) {
        -webkit-animation: pacman-balls 1s linear .33s infinite;
        animation: pacman-balls 1s linear .33s infinite
    }

    .loading-image div:nth-child(4) {
        -webkit-animation: pacman-balls 1s linear .66s infinite;
        animation: pacman-balls 1s linear .66s infinite
    }

    .loading-image div:nth-child(5) {
        -webkit-animation: pacman-balls 1s linear .99s infinite;
        animation: pacman-balls 1s linear .99s infinite
    }

    /* Upper half of the figure: a zero-size box drawn entirely with borders,
       with the right border transparent to form the open "mouth". */
   .loading-image div:first-of-type {
        width: 0;
        height: 0;
        border: 25px solid #49b1f5;
        border-right-color: transparent;
        border-radius: 25px;
        -webkit-animation: rotate_pacman_half_up .5s 0s infinite;
        animation: rotate_pacman_half_up .5s 0s infinite;
    }
    /* Lower half: same shape, rotating the opposite way; the negative margin
       pulls it up so it overlaps the first half. */
    .loading-image div:nth-child(2) {
        width: 0;
        height: 0;
        border: 25px solid #49b1f5;
        border-right-color: transparent;
        border-radius: 25px;
        -webkit-animation: rotate_pacman_half_down .5s 0s infinite;
        animation: rotate_pacman_half_down .5s 0s infinite;
        margin-top: -50px;
    }
    @-webkit-keyframes rotate_pacman_half_up {0% {transform: rotate(270deg)}50% {transform: rotate(1turn)}to {transform: rotate(270deg)}}

    @keyframes rotate_pacman_half_up {0% {transform: rotate(270deg)}50% {transform: rotate(1turn)}to {transform: rotate(270deg)}}

    @-webkit-keyframes rotate_pacman_half_down {0% {transform: rotate(90deg)}50% {transform: rotate(0deg)}to {transform: rotate(90deg)}}

    @keyframes rotate_pacman_half_down {0% {transform: rotate(90deg)}50% {transform: rotate(0deg)}to {transform: rotate(90deg)}}

    /* Dots slide 100px to the left while fading slightly. */
    @-webkit-keyframes pacman-balls {75% {opacity: .7}to {transform: translate(-100px, -6.25px)}}

    @keyframes pacman-balls {75% {opacity: .7}to {transform: translate(-100px, -6.25px)}}


    /* Shared geometry of the dots. The earlier 15px width/height declarations
       were dropped because the 10px values in the same rule overrode them. */
    .loading-image div:nth-child(3),
    .loading-image div:nth-child(4),
    .loading-image div:nth-child(5),
    .loading-image div:nth-child(6){
        background-color: #49b1f5;
        border-radius: 100%;
        margin: 2px;
        width: 10px;
        height: 10px;
        position: absolute;
        transform: translateY(-6.25px);
        top: 25px;
        left: 100px;
    }
    /* Caption shown above the animation. */
    .loading-text{
        margin-bottom: 20vh;
        text-align: center;
        color: #2c3e50;
        font-size: 2rem;
        box-sizing: border-box;
        padding: 0 10px;
        text-shadow: 0 2px 10px rgba(0,0,0,0.2);
    }
    @media only screen and (max-width: 500px) {
         .loading-text{
            font-size: 1.5rem;
         }
    }
    /* Applied by the inline script to start the overlay's opacity transition. */
    .fadeout {
        opacity: 0;
        filter: alpha(opacity=0);
    }
    /* Logo entrance animation. The unprefixed rule previously had only a 0%
       frame with a prefixed transform; it now mirrors the -webkit- version. */
    @-webkit-keyframes fadeInDown{0%{opacity:0;-webkit-transform:translate3d(0,-100%,0);transform:translate3d(0,-100%,0)}100%{opacity:1;-webkit-transform:none;transform:none}}
    @keyframes fadeInDown{0%{opacity:0;-webkit-transform:translate3d(0,-100%,0);transform:translate3d(0,-100%,0)}100%{opacity:1;-webkit-transform:none;transform:none}}
 </style>
 <script>
(function () {
    // Keeps the loading overlay on screen for a minimum time, then fades it
    // out and finally removes it from layout.
    const MIN_VISIBLE_MS = 1000; // overlay is always shown for at least 1s
    const FADE_MS = 1000;        // matches the 1s opacity transition in the CSS

    // Starts the fade-out by swapping the element's class to "fadeout", then
    // hides the element once the transition has had time to finish.
    const hideLoader = function () {
        const loader = document.getElementById("loading-container");
        if (!loader) {
            // Nothing to hide (element absent); avoid a TypeError.
            return;
        }
        loader.className = "fadeout";
        setTimeout(function () {
            loader.style.display = "none";
        }, FADE_MS);
    };

    const loaded = function () {
        setTimeout(function () {
            if (document.getElementById("loading-container")) {
                hideLoader();
            } else {
                // This script runs from <head>; if the overlay markup has not
                // been parsed yet when the timer fires, retry once the DOM is ready.
                document.addEventListener("DOMContentLoaded", hideLoader);
            }
        }, MIN_VISIBLE_MS);
    };
    loaded();
})()
 </script><meta name="generator" content="Hexo 4.2.1"><link rel="alternate" href="/atom.xml" title="欢迎来到，TWOTO 的博客" type="application/atom+xml">
<link rel="stylesheet" href="/css/prism-tomorrow.css" type="text/css">
<link rel="stylesheet" href="/css/prism-line-numbers.css" type="text/css"><style type="text/css" lang="css">
    /* Intentionally empty: the loading-overlay rules that used to be repeated
       here were an exact duplicate of the first <style> block in <head>
       (#loading-container, .loading-image, .loading-text, .fadeout and their
       keyframes), so they had no effect on rendering. */
 </style>
 <script>
// Intentionally empty: this block repeated the loading-overlay fade-out IIFE
// that already runs from the first inline <script> in <head>. Running it twice
// only scheduled the same class change and display:none a second time.
 </script></head>



 <body>
 <!-- Loading overlay: styled by the #loading-container rules in <head> and hidden by
      the inline fade-out script. It now sits inside <body>; flow content between
      </head> and <body> is invalid markup. -->
 <div id="loading-container">
     <p class="loading-text">玩命加载中 . . . </p> 
     <!-- Purely decorative animation: two half-discs followed by three dots. -->
     <div class="loading-image" aria-hidden="true">
         <div></div>
         <div></div>
         <div></div>
         <div></div> 
         <div></div>
     </div>
 </div>
<!-- Animated line background (canvas-nest.js), loaded over an explicit https: URL.
     NOTE(review): the color/opacity/zIndex/count attributes are presumably read by
     the library from its own <script> tag; confirm before adding defer/async or
     moving this tag. TODO: consider self-hosting or adding an integrity hash. -->
<script type="text/javascript"
color="122 103 238" opacity="0.5" zIndex="-1" count="200"
src="https://cdn.bootcss.com/canvas-nest.js/1.0.0/canvas-nest.min.js">
</script>

<header class="navbar-fixed">
    <nav id="headNav" class="bg-color nav-transparent">
        <div id="navContainer" class="nav-wrapper container">
            <div class="brand-logo">
                <a href="/" class="waves-effect waves-light">
                    
                    <img src="/medias/mylogo.png" class="logo-img" alt="LOGO">
                    
                    <span class="logo-span">欢迎来到，TWOTO 的博客</span>
                </a>
            </div>
            

<!-- Mobile menu toggle: icon-only control, so it carries an explicit accessible name. -->
<a href="#" data-target="mobile-nav" class="sidenav-trigger button-collapse" aria-label="打开导航菜单"><i class="fas fa-bars" aria-hidden="true"></i></a>
<ul class="right nav-menu">
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/" class="waves-effect waves-light">
      
      <i class="fas fa-home" style="zoom: 0.6;"></i>
      
      <span>首页</span>
    </a>
    
  </li>
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/tags" class="waves-effect waves-light">
      
      <i class="fas fa-tags" style="zoom: 0.6;"></i>
      
      <span>标签</span>
    </a>
    
  </li>
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/categories" class="waves-effect waves-light">
      
      <i class="fas fa-bookmark" style="zoom: 0.6;"></i>
      
      <span>分类</span>
    </a>
    
  </li>
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/archives" class="waves-effect waves-light">
      
      <i class="fas fa-archive" style="zoom: 0.6;"></i>
      
      <span>归档</span>
    </a>
    
  </li>
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/about" class="waves-effect waves-light">
      
      <i class="fas fa-user-circle" style="zoom: 0.6;"></i>
      
      <span>关于</span>
    </a>
    
  </li>
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/contact" class="waves-effect waves-light">
      
      <i class="fas fa-comments" style="zoom: 0.6;"></i>
      
      <span>留言板</span>
    </a>
    
  </li>
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/friends" class="waves-effect waves-light">
      
      <i class="fas fa-address-book" style="zoom: 0.6;"></i>
      
      <span>友情链接</span>
    </a>
    
  </li>
  
  <li>
    <!-- Opens the search modal; aria-label names the icon-only link for screen readers. -->
    <a href="#searchModal" class="modal-trigger waves-effect waves-light" aria-label="搜索">
      <i id="searchIcon" class="fas fa-search" title="搜索" style="zoom: 0.85;" aria-hidden="true"></i>
    </a>
  </li>
</ul>


<div id="mobile-nav" class="side-nav sidenav">

    <div class="mobile-head bg-color">
        
        <!-- Decorative logo: the site name follows as text, so alt is intentionally empty. -->
        <img src="/medias/mylogo.png" class="logo-img circle responsive-img" alt="">
        
        <div class="logo-name">欢迎来到，TWOTO 的博客</div>
        <div class="logo-desc">
            
            技术、效率、摄影
            
        </div>
    </div>

    

    <ul class="menu-list mobile-menu-list">
        
        <li class="m-nav-item">
	  
		<a href="/" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-home"></i>
			
			首页
		</a>
          
        </li>
        
        <li class="m-nav-item">
	  
		<a href="/tags" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-tags"></i>
			
			标签
		</a>
          
        </li>
        
        <li class="m-nav-item">
	  
		<a href="/categories" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-bookmark"></i>
			
			分类
		</a>
          
        </li>
        
        <li class="m-nav-item">
	  
		<a href="/archives" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-archive"></i>
			
			归档
		</a>
          
        </li>
        
        <li class="m-nav-item">
	  
		<a href="/about" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-user-circle"></i>
			
			关于
		</a>
          
        </li>
        
        <li class="m-nav-item">
	  
		<a href="/contact" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-comments"></i>
			
			留言板
		</a>
          
        </li>
        
        <li class="m-nav-item">
	  
		<a href="/friends" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-address-book"></i>
			
			友情链接
		</a>
          
        </li>
        
        
        <li><div class="divider"></div></li>
        <li>
            <!-- External link opened in a new tab; rel="noopener" keeps the new page from reaching window.opener. -->
            <a href="https://github.com/DongZhouGu" class="waves-effect waves-light" target="_blank" rel="noopener">
                <i class="fab fa-github-square fa-fw" aria-hidden="true"></i>Fork Me
            </a>
        </li>
        
    </ul>
</div>


        </div>

        
            <style>
    .nav-transparent .github-corner {
        display: none !important;
    }

    .github-corner {
        position: absolute;
        z-index: 10;
        top: 0;
        right: 0;
        border: 0;
        transform: scale(1.1);
    }

    .github-corner svg {
        color: #0f9d58;
        fill: #fff;
        height: 64px;
        width: 64px;
    }

    .github-corner:hover .octo-arm {
        animation: a 0.56s ease-in-out;
    }

    .github-corner .octo-arm {
        animation: none;
    }

    @keyframes a {
        0%,
        to {
            transform: rotate(0);
        }
        20%,
        60% {
            transform: rotate(-25deg);
        }
        40%,
        80% {
            transform: rotate(10deg);
        }
    }
</style>

<!-- GitHub corner badge. The SVG is hidden from assistive technology, so the link
     itself carries the accessible name; rel="noopener" protects window.opener. -->
<a href="https://github.com/DongZhouGu" class="github-corner tooltipped hide-on-med-and-down" target="_blank" rel="noopener"
   data-tooltip="Fork Me" data-position="left" data-delay="50" aria-label="Fork Me">
    <svg viewBox="0 0 250 250" aria-hidden="true">
        <path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"></path>
        <path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2"
              fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"></path>
        <path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z"
              fill="currentColor" class="octo-body"></path>
    </svg>
</a>
        
    </nav>

</header>





<div class="bg-cover pd-header post-cover" style="background-image: url('https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/medias/featureimages/6.jpg')">
    <div class="container" style="right: 0px;left: 0px;">
        <div class="row">
            <div class="col s12 m12 l12">
                <div class="brand">
                    <h1 class="description center-align post-title">scikit-learn系列二：K-近邻</h1>
                </div>
            </div>
        </div>
    </div>
</div>




<main class="post-container content">

    
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/libs/tocbot/tocbot.css">
<style>
    #articleContent h1::before,
    #articleContent h2::before,
    #articleContent h3::before,
    #articleContent h4::before,
    #articleContent h5::before,
    #articleContent h6::before {
        display: block;
        content: " ";
        height: 100px;
        margin-top: -100px;
        visibility: hidden;
    }

    /* Suppress the focus ring only for focus that is not :focus-visible (typically
       pointer clicks); keyboard focus keeps the browser's default outline. */
    #articleContent :focus:not(:focus-visible) {
        outline: none;
    }

    .toc-fixed {
        position: fixed;
        top: 64px;
    }

    .toc-widget {
        width: 345px;
        padding-left: 20px;
    }

    .toc-widget .toc-title {
        margin: 35px 0 15px 0;
        padding-left: 17px;
        font-size: 1.5rem;
        font-weight: bold;
        line-height: 1.5rem;
    }

    .toc-widget ol {
        padding: 0;
        list-style: none;
    }

    #toc-content {
        height: calc(100vh - 250px);
        overflow: auto;
    }

    #toc-content ol {
        padding-left: 10px;
    }

    #toc-content ol li {
        padding-left: 10px;
    }

    #toc-content .toc-link:hover {
        color: #42b983;
        font-weight: 700;
        text-decoration: underline;
    }

    #toc-content .toc-link::before {
        background-color: transparent;
        max-height: 25px;

        position: absolute;
        right: 23.5vw;
        display: block;
    }

    #toc-content .is-active-link {
        color: #42b983;
    }

    #floating-toc-btn {
        position: fixed;
        right: 15px;
        bottom: 76px;
        padding-top: 15px;
        margin-bottom: 0;
        z-index: 998;
    }

    #floating-toc-btn .btn-floating {
        width: 48px;
        height: 48px;
    }

    #floating-toc-btn .btn-floating i {
        line-height: 48px;
        font-size: 1.4rem;
    }
</style>
<div class="row">
    <div id="main-content" class="col s12 m12 l9">
        <!-- 文章内容详情 -->
<div id="artDetail">
    <div class="card">
        <div class="card-content article-info">
            <div class="row tag-cate">
                <div class="col s7">
                    
                    <div class="article-tag">
                        
                            <a href="/tags/%E5%9F%BA%E7%A1%80%E7%9F%A5%E8%AF%86/">
                                <span class="chip bg-color">基础知识</span>
                            </a>
                        
                            <a href="/tags/ML%E7%AE%97%E6%B3%95/">
                                <span class="chip bg-color">ML算法</span>
                            </a>
                        
                    </div>
                    
                </div>
                <div class="col s5 right-align">
                    
                    <div class="post-cate">
                        <i class="fas fa-bookmark fa-fw icon-category"></i>
                        
                            <a href="/categories/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/" class="post-category">
                                机器学习
                            </a>
                        
                    </div>
                    
                </div>
            </div>

            <div class="post-info">
                
                <div class="post-date info-break-policy">
                    <i class="far fa-calendar-minus fa-fw"></i>发布日期:&nbsp;&nbsp;
                    2020-06-29
                </div>
                

                

                
                <div class="info-break-policy">
                    <i class="far fa-file-word fa-fw"></i>文章字数:&nbsp;&nbsp;
                    5.3k
                </div>
                

                

                
                    <div id="busuanzi_container_page_pv" class="info-break-policy">
                        <i class="far fa-eye fa-fw"></i>阅读次数:&nbsp;&nbsp;
                        <span id="busuanzi_value_page_pv"></span>
                    </div>
				
            </div>
        </div>
        <hr class="clearfix">
        <div class="card-content article-card-content">
            <div id="articleContent">
                <h2 id="项目地址传送门，欢迎-star-和-fork-！"><a href="#项目地址传送门，欢迎-star-和-fork-！" class="headerlink" title="项目地址传送门，欢迎 star 和 fork ！"></a>项目地址<a href="https://github.com/DongZhouGu/scikit-learn-ml" target="_blank" rel="noopener">传送门</a>，欢迎 star 和 fork ！</h2><h2 id="KNN-概述"><a href="#KNN-概述" class="headerlink" title="KNN 概述"></a>KNN 概述</h2><p>KNN（K-Nearest Neighbor，K-近邻算法）算法是一种<strong>有监督</strong>的机器学习算法，可以解决分类问题，也可以解决回归问题。</p>
<blockquote>
<p><strong>一句话总结: 近朱者赤近墨者黑！</strong> </p>
</blockquote>
<p>k -近邻算法的输入为实例的特征向量，对应于特征空间的点；输出为实例的类别，可以取多类。k 近邻算法假设给定一个训练数据集，其中的实例类别已定。分类时，对新的实例，根据其 k 个最近邻的训练实例的类别，通过多数表决等方式进行预测。因此，k近邻算法不具有显式的学习过程。</p>
<h2 id="KNN算法原理"><a href="#KNN算法原理" class="headerlink" title="KNN算法原理"></a>KNN算法原理</h2><p>K-近邻算法的核心思想是未标记样本的类别，由距离其最近的 K 个邻居投票来决定。</p>
<p>假设，我们有一个已经标记的数据集，即已经知道了数据集中每个样本所属的类别。此时，有一个未标记的数据样本，我们的任务是预测出这个数据样本所属的类别。<strong>K-近邻算法的原理是，计算待标记的数据样本和数据集中每个样本的距离，取距离最近的K个样本。</strong>待标记的数据样本所属的类别，就由这K个距离最近的样本投票产生。</p>
<blockquote>
<p>KNN工作原理</p>
</blockquote>
<p>假设X_test为待标记的数据样本，X_train为已标记的数据集，算法原理的伪代码如下：</p>
<ul>
<li>遍历 X_train 中的所有样本，计算每个样本与 X_test 的距离，并把距离保存在 Distance 数组中。</li>
<li>对 Distance 数组进行排序，取距离最近的K个点，记为 X_knn 。</li>
<li>在 X_knn 中统计每个类别的个数，即 class0 在 X_knn 中有几个样本，class1 在 X_knn 中有几个样本等。</li>
<li>待标记样本的类别，就是在 X_knn 中样本数最多的那个类别。</li>
</ul>
<blockquote>
<p>KNN算法优缺点</p>
</blockquote>
<ul>
<li>优点：准确度高，对异常值和噪声有较高的容忍度。</li>
<li>缺点：计算复杂度高、空间复杂度高，从算法原理可以看出，每次对一个未标记样本进行分类时，都需要全部计算一遍距离。</li>
</ul>
<blockquote>
<p>KNN算法参数</p>
</blockquote>
<p>其算法参数是K，参数选择需要根据数据来决定。K值越大，模型的偏差越大，对噪声数据越不敏感，当K值很大时，可能造成模型欠拟合；K值越小，模型的方差就会越大，当K值太小，就会造成模型过拟合。</p>
<blockquote>
<p>KNN算法变种</p>
</blockquote>
<p>K-近邻算法有一些变种，其中之一就是可以增加邻居的权重。默认情况下，在计算距离时，都是使用相同的权重。实际上，我们可以针对不同的邻居指定不同的权重，如距离越近权重越高。这个可以通过指定算法的weights参数来实现。</p>
<p>另外一个变种是，使用一定半径内的点取代距离最近的K个点。在 <code>scikit-learn</code> 里，<code>RadiusNeighborsClassifier</code> 类实现了这个算法的变种。当数据采样不均匀时，该算法变种可以取得更好的性能。</p>
<h2 id="KNN-项目案例"><a href="#KNN-项目案例" class="headerlink" title="KNN 项目案例"></a>KNN 项目案例</h2><h3 id="案例1-使用KNN算法进行分类"><a href="#案例1-使用KNN算法进行分类" class="headerlink" title="案例1: 使用KNN算法进行分类"></a>案例1: 使用KNN算法进行分类</h3><p>完整代码地址：</p>
<p>在 <code>scikit-learn</code>里，使用K-近邻算法进行分类处理的是 <code>sklearn.neighbors.KNeighborsClassifier</code> 类。</p>
<h4 id="rainbow-生成数据集"><a href="#rainbow-生成数据集" class="headerlink" title=":rainbow:   生成数据集"></a>:rainbow:   生成数据集</h4><p>我们使用 <code>sklearn.datasets</code> 模块中的 <code>make_blobs()</code> 函数来生成数据集，这里生成60个训练样本，这些样本分布在 <code>centers</code> 参数指定的中心点的周围。<code>cluster_std</code> 是标准差，用来指明生成的点分布的松散程度。生成的训练数据集放在变量X里面，数据集的类别标记放在 y 里面。</p>
<pre class="line-numbers language-python"><code class="language-python"><span class="token keyword">from</span> sklearn<span class="token punctuation">.</span>datasets <span class="token keyword">import</span> make_blobs
<span class="token comment" spellcheck="true"># 生成数据</span>
centers <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">[</span><span class="token operator">-</span><span class="token number">2</span><span class="token punctuation">,</span><span class="token number">2</span><span class="token punctuation">]</span><span class="token punctuation">,</span><span class="token punctuation">[</span><span class="token number">2</span><span class="token punctuation">,</span><span class="token number">2</span><span class="token punctuation">]</span><span class="token punctuation">,</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">,</span><span class="token number">4</span><span class="token punctuation">]</span><span class="token punctuation">]</span>
X<span class="token punctuation">,</span>y <span class="token operator">=</span> make_blobs<span class="token punctuation">(</span>n_samples<span class="token operator">=</span><span class="token number">60</span><span class="token punctuation">,</span>centers<span class="token operator">=</span>centers<span class="token punctuation">,</span>random_state<span class="token operator">=</span><span class="token number">0</span><span class="token punctuation">,</span>cluster_std<span class="token operator">=</span><span class="token number">0.60</span><span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span></span></code></pre>
<blockquote>
<p>X:  [[ 1.59652373  1.7842681 ] ,[-1.08033247 2.88161526],…]共60个点的横纵坐标</p>
<p>y: [1 0 0 1 0 1 1 0 2…1 2 0 1] 共60个点的类别，用0，1，2分别表示以哪个中心聚合</p>
</blockquote>
<p>使用 <code>matplotlib</code> 库，它可以很容易地把生成的点画出来：</p>
<pre class="line-numbers language-python"><code class="language-python"><span class="token keyword">import</span> matplotlib<span class="token punctuation">.</span>pyplot <span class="token keyword">as</span> plt
<span class="token keyword">import</span> numpy <span class="token keyword">as</span> np
plt<span class="token punctuation">.</span>figure<span class="token punctuation">(</span>figsize<span class="token operator">=</span><span class="token punctuation">(</span><span class="token number">16</span><span class="token punctuation">,</span><span class="token number">10</span><span class="token punctuation">)</span><span class="token punctuation">,</span>dpi<span class="token operator">=</span><span class="token number">144</span><span class="token punctuation">)</span>
c <span class="token operator">=</span> np<span class="token punctuation">.</span>array<span class="token punctuation">(</span>centers<span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># cmap就是指matplotlib.colors.Colormap,一个包含三列矩阵的色彩映射表</span>
<span class="token comment" spellcheck="true"># 使用c和cmap来映射颜色，s为形状的大小</span>
plt<span class="token punctuation">.</span>scatter<span class="token punctuation">(</span>X<span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token punctuation">,</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">,</span>X<span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token punctuation">,</span><span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">,</span>c<span class="token operator">=</span>y<span class="token punctuation">,</span>s<span class="token operator">=</span><span class="token number">100</span><span class="token punctuation">,</span>cmap<span class="token operator">=</span><span class="token string">'cool'</span><span class="token punctuation">)</span>
plt<span class="token punctuation">.</span>scatter<span class="token punctuation">(</span>c<span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token punctuation">,</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">,</span>c<span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token punctuation">,</span><span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">,</span>s<span class="token operator">=</span><span class="token number">100</span><span class="token punctuation">,</span>marker<span class="token operator">=</span><span class="token string">'*'</span><span class="token punctuation">,</span>c<span class="token operator">=</span><span class="token string">'black'</span><span class="token punctuation">)</span>
plt<span class="token punctuation">.</span>show<span class="token punctuation">(</span><span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span></span></code></pre>
<img src="/medias/loading.gif" data-original="https://cdn.jsdelivr.net/gh/dongzhougu/imageuse1/image-20200628191906466.png" alt="make_blobs 生成的60个样本散点图，五角星标记各类别的中心点">

<p>这些点的分布情况在坐标轴上一目了然，其中五角星的点即各个类别的中心点。</p>
<h4 id="rainbow-训练算法"><a href="#rainbow-训练算法" class="headerlink" title=":rainbow: 训练算法"></a>:rainbow: 训练算法</h4><p>使用 <code>KNeighborsClassifier</code> 来对算法进行训练，我们选择的参数是 <code>K=5</code></p>
<pre class="line-numbers language-python"><code class="language-python"><span class="token keyword">from</span> sklearn<span class="token punctuation">.</span>neighbors <span class="token keyword">import</span> KNeighborsClassifier
k <span class="token operator">=</span> <span class="token number">5</span>
clf <span class="token operator">=</span> KNeighborsClassifier<span class="token punctuation">(</span>n_neighbors<span class="token operator">=</span>k<span class="token punctuation">)</span>
clf<span class="token punctuation">.</span>fit<span class="token punctuation">(</span>X<span class="token punctuation">,</span>y<span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span></span></code></pre>
<p><code>KNeighborsClassifier</code>的参数细节为：</p>
<pre class="line-numbers language-python"><code class="language-python">KNeighborsClassifier<span class="token punctuation">(</span>algorithm<span class="token operator">=</span><span class="token string">'auto'</span><span class="token punctuation">,</span> leaf_size<span class="token operator">=</span><span class="token number">30</span><span class="token punctuation">,</span> metric<span class="token operator">=</span><span class="token string">'minkowski'</span><span class="token punctuation">,</span>
           metric_params<span class="token operator">=</span>None<span class="token punctuation">,</span> n_jobs<span class="token operator">=</span><span class="token number">1</span><span class="token punctuation">,</span> n_neighbors<span class="token operator">=</span><span class="token number">5</span><span class="token punctuation">,</span> p<span class="token operator">=</span><span class="token number">2</span><span class="token punctuation">,</span>
           weights<span class="token operator">=</span><span class="token string">'uniform'</span><span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span></span></code></pre>
<h4 id="rainbow-对样本进行预测"><a href="#rainbow-对样本进行预测" class="headerlink" title=":rainbow:  对样本进行预测"></a>:rainbow:  对样本进行预测</h4><p>我们要预测的样本是[0,2]，使用 <code>kneighbors()</code> 方法，把这个样本周围距离最近的5个点取出来。取出来的点是训练样本X里的索引，从0开始计算。<br> 注意：<code>kneighbors()</code>接收一个二维数组作为参数，所以 <code>X_sample</code> 需要变成二维。</p>
<pre class="line-numbers language-python"><code class="language-python">X_sample <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">,</span><span class="token number">2</span><span class="token punctuation">]</span>
X_sample <span class="token operator">=</span> np<span class="token punctuation">.</span>array<span class="token punctuation">(</span>X_sample<span class="token punctuation">)</span><span class="token punctuation">.</span>reshape<span class="token punctuation">(</span><span class="token number">1</span><span class="token punctuation">,</span> <span class="token operator">-</span><span class="token number">1</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true">#[[0 2]]</span>
y_sample <span class="token operator">=</span> clf<span class="token punctuation">.</span>predict<span class="token punctuation">(</span>X_sample<span class="token punctuation">)</span>
neighbors <span class="token operator">=</span> clf<span class="token punctuation">.</span>kneighbors<span class="token punctuation">(</span>X_sample<span class="token punctuation">,</span>return_distance<span class="token operator">=</span><span class="token boolean">False</span><span class="token punctuation">)</span> <span class="token comment" spellcheck="true">#[[16 20 48  6 23]]</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span></span></code></pre>
<h4 id="rainbow-标记最近的5个点"><a href="#rainbow-标记最近的5个点" class="headerlink" title=":rainbow:  标记最近的5个点"></a>:rainbow:  标记最近的5个点</h4><p>把待预测的样本以及和其最近的5个点标记出来</p>
<pre class="line-numbers language-python"><code class="language-python">plt<span class="token punctuation">.</span>figure<span class="token punctuation">(</span>figsize<span class="token operator">=</span><span class="token punctuation">(</span><span class="token number">16</span><span class="token punctuation">,</span><span class="token number">10</span><span class="token punctuation">)</span><span class="token punctuation">,</span>dpi<span class="token operator">=</span><span class="token number">144</span><span class="token punctuation">)</span>
plt<span class="token punctuation">.</span>scatter<span class="token punctuation">(</span>X<span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token punctuation">,</span> <span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">,</span> X<span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token punctuation">,</span> <span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">,</span> c<span class="token operator">=</span>y<span class="token punctuation">,</span> s<span class="token operator">=</span><span class="token number">100</span><span class="token punctuation">,</span> cmap<span class="token operator">=</span><span class="token string">'cool'</span><span class="token punctuation">)</span>    <span class="token comment" spellcheck="true"># 样本</span>
plt<span class="token punctuation">.</span>scatter<span class="token punctuation">(</span>c<span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token punctuation">,</span> <span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">,</span> c<span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token punctuation">,</span> <span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">,</span> s<span class="token operator">=</span><span class="token number">100</span><span class="token punctuation">,</span> marker<span class="token operator">=</span><span class="token string">'^'</span><span class="token punctuation">,</span> c<span class="token operator">=</span><span class="token string">'k'</span><span class="token punctuation">)</span>   <span class="token comment" spellcheck="true"># 中心点</span>
plt<span class="token punctuation">.</span>scatter<span class="token punctuation">(</span>X_sample<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">,</span>X_sample<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">[</span><span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">,</span>marker<span class="token operator">=</span><span class="token string">"x"</span><span class="token punctuation">,</span> s<span class="token operator">=</span><span class="token number">100</span><span class="token punctuation">,</span> cmap<span class="token operator">=</span><span class="token string">'cool'</span><span class="token punctuation">)</span>  <span class="token comment" spellcheck="true">#待预测的点</span>
<span class="token comment" spellcheck="true">#预测点与距离最近的5个样本的连线</span>
<span class="token keyword">for</span> i <span class="token keyword">in</span> neighbors<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">:</span>
    plt<span class="token punctuation">.</span>plot<span class="token punctuation">(</span><span class="token punctuation">[</span>X<span class="token punctuation">[</span>i<span class="token punctuation">]</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">,</span>X_sample<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">]</span><span class="token punctuation">,</span><span class="token punctuation">[</span>X<span class="token punctuation">[</span>i<span class="token punctuation">]</span><span class="token punctuation">[</span><span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">,</span>X_sample<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">[</span><span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">]</span><span class="token punctuation">,</span><span class="token string">'k--'</span><span class="token punctuation">,</span>linewidth<span class="token operator">=</span><span class="token number">0.6</span><span class="token punctuation">)</span>
plt<span class="token punctuation">.</span>show<span class="token punctuation">(</span><span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span></span></code></pre>
<p><img src="/medias/loading.gif" data-original="https://cdn.jsdelivr.net/gh/dongzhougu/imageuse1/image-20200628204257514.png" alt=""></p>
<h3 id="案例2-使用KNN算法进行回归拟合"><a href="#案例2-使用KNN算法进行回归拟合" class="headerlink" title="案例2: 使用KNN算法进行回归拟合"></a>案例2: 使用KNN算法进行回归拟合</h3><p>分类问题的预测值是离散的，我们也可以使用 KNN 算法对连续区间内的数值进行预测，即进行回归拟合。在<code>scikit-learn</code>里面，使用KNN算法进行回归拟合的实现是 <code>sklearn.neighbors.KNeighborsRegressor</code> 类。</p>
<h4 id="rainbow-生成数据集-1"><a href="#rainbow-生成数据集-1" class="headerlink" title=":rainbow:  生成数据集"></a>:rainbow:  生成数据集</h4><p>在余弦曲线的基础上加入了噪声：</p>
<pre class="line-numbers language-python"><code class="language-python"><span class="token keyword">import</span> numpy <span class="token keyword">as</span> np
n_dots <span class="token operator">=</span> <span class="token number">40</span>
<span class="token comment" spellcheck="true"># 生成40行1列的服从“0~5”均匀分布的随机样本</span>
X <span class="token operator">=</span> <span class="token number">5</span> <span class="token operator">*</span> np<span class="token punctuation">.</span>random<span class="token punctuation">.</span>rand<span class="token punctuation">(</span>n_dots<span class="token punctuation">,</span> <span class="token number">1</span><span class="token punctuation">)</span>
y <span class="token operator">=</span> np<span class="token punctuation">.</span>cos<span class="token punctuation">(</span>X<span class="token punctuation">)</span><span class="token punctuation">.</span>flatten<span class="token punctuation">(</span><span class="token punctuation">)</span>
<span class="token comment" spellcheck="true"># 生成40个服从“-0.1~0.1”均匀分布的随机误差（一维数组）</span>
y <span class="token operator">+=</span> <span class="token number">0.2</span> <span class="token operator">*</span> np<span class="token punctuation">.</span>random<span class="token punctuation">.</span>rand<span class="token punctuation">(</span>n_dots<span class="token punctuation">)</span> <span class="token operator">-</span> <span class="token number">0.1</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span><span></span><span></span><span></span></span></code></pre>
<h4 id="rainbow-训练算法-1"><a href="#rainbow-训练算法-1" class="headerlink" title=":rainbow:  训练算法"></a>:rainbow:  训练算法</h4><p>使用 <code>KNeighborsRegressor</code> 来训练模型：</p>
<pre class="line-numbers language-python"><code class="language-python"><span class="token keyword">from</span> sklearn<span class="token punctuation">.</span>neighbors <span class="token keyword">import</span> KNeighborsRegressor
k <span class="token operator">=</span> <span class="token number">5</span>
knn <span class="token operator">=</span> KNeighborsRegressor<span class="token punctuation">(</span>k<span class="token punctuation">)</span>
knn<span class="token punctuation">.</span>fit<span class="token punctuation">(</span>X<span class="token punctuation">,</span>y<span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span></span></code></pre>
<p><code>KNeighborsRegressor</code> 的参数细节为：</p>
<pre class="line-numbers language-python"><code class="language-python">KNeighborsRegressor<span class="token punctuation">(</span>algorithm<span class="token operator">=</span><span class="token string">'auto'</span><span class="token punctuation">,</span> leaf_size<span class="token operator">=</span><span class="token number">30</span><span class="token punctuation">,</span> metric<span class="token operator">=</span><span class="token string">'minkowski'</span><span class="token punctuation">,</span>
          metric_params<span class="token operator">=</span>None<span class="token punctuation">,</span> n_jobs<span class="token operator">=</span><span class="token number">1</span><span class="token punctuation">,</span> n_neighbors<span class="token operator">=</span><span class="token number">5</span><span class="token punctuation">,</span> p<span class="token operator">=</span><span class="token number">2</span><span class="token punctuation">,</span>
          weights<span class="token operator">=</span><span class="token string">'uniform'</span><span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span></span></code></pre>
<p>可以使用 <code>score()</code> 方法计算拟合曲线对训练样本的拟合准确性（对回归模型而言，该得分是决定系数 R²）：</p>
<pre class="line-numbers language-python"><code class="language-python">knn<span class="token punctuation">.</span>score<span class="token punctuation">(</span>X<span class="token punctuation">,</span>y<span class="token punctuation">)</span>
<span class="token number">0.9596828473009764</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span></span></code></pre>
<h4 id="rainbow-回归拟合"><a href="#rainbow-回归拟合" class="headerlink" title=":rainbow:  回归拟合"></a>:rainbow:  回归拟合</h4><p> 画出拟合曲线的一个常用方法是：在X轴上的指定区域生成足够多的点，针对这些足够密集的点，使用训练出来的模型进行预测，得到预测值 <code>y_pred</code>，然后在坐标系中把所有的预测点连接起来，这样就画出了拟合曲线。<br> 生成足够密集的点并进行预测：</p>
<pre class="line-numbers language-python"><code class="language-python">T <span class="token operator">=</span> np<span class="token punctuation">.</span>linspace<span class="token punctuation">(</span><span class="token number">0</span><span class="token punctuation">,</span><span class="token number">5</span><span class="token punctuation">,</span><span class="token number">500</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token punctuation">,</span>np<span class="token punctuation">.</span>newaxis<span class="token punctuation">]</span>
y_pred <span class="token operator">=</span> knn<span class="token punctuation">.</span>predict<span class="token punctuation">(</span>T<span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span></span></code></pre>
<p>把这些预测点连起来，构成拟合曲线：</p>
<pre class="line-numbers language-python"><code class="language-python">plt<span class="token punctuation">.</span>figure<span class="token punctuation">(</span>figsize<span class="token operator">=</span><span class="token punctuation">(</span><span class="token number">16</span><span class="token punctuation">,</span><span class="token number">10</span><span class="token punctuation">)</span><span class="token punctuation">,</span>dpi<span class="token operator">=</span><span class="token number">144</span><span class="token punctuation">)</span>
plt<span class="token punctuation">.</span>scatter<span class="token punctuation">(</span>X<span class="token punctuation">,</span>y<span class="token punctuation">,</span>c<span class="token operator">=</span><span class="token string">'g'</span><span class="token punctuation">,</span>label<span class="token operator">=</span><span class="token string">'data'</span><span class="token punctuation">,</span>s<span class="token operator">=</span><span class="token number">100</span><span class="token punctuation">)</span>
plt<span class="token punctuation">.</span>plot<span class="token punctuation">(</span>T<span class="token punctuation">,</span>y_pred<span class="token punctuation">,</span>c<span class="token operator">=</span><span class="token string">'k'</span><span class="token punctuation">,</span>label<span class="token operator">=</span><span class="token string">'prediction'</span><span class="token punctuation">,</span>lw<span class="token operator">=</span><span class="token number">4</span><span class="token punctuation">)</span>
plt<span class="token punctuation">.</span>axis<span class="token punctuation">(</span><span class="token string">'tight'</span><span class="token punctuation">)</span>
plt<span class="token punctuation">.</span>title<span class="token punctuation">(</span><span class="token string">'KNeighborsRegressor (k = %i)'</span> <span class="token operator">%</span> k<span class="token punctuation">)</span>
plt<span class="token punctuation">.</span>show<span class="token punctuation">(</span><span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span><span></span><span></span></span></code></pre>
<p>最终生成的拟合曲线和训练样本数据如图，拟合出来确实和 cos 曲线相似。</p>
<img src="/medias/loading.gif" data-original="https://cdn.jsdelivr.net/gh/dongzhougu/imageuse1/image-20200628223010274.png" alt="KNN 回归拟合曲线与训练样本" style="zoom: 67%;">



<h3 id="案例3-使用KNN算法预测糖尿病"><a href="#案例3-使用KNN算法预测糖尿病" class="headerlink" title="案例3: 使用KNN算法预测糖尿病"></a>案例3: 使用KNN算法预测糖尿病</h3><p>本节使用KNN算法及其变种，对Pima印第安人的糖尿病进行预测。数据来源于 <a href="https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database">kaggle.com</a>，大家可以自己去下载，也可以使用 <a href="./pima-indians-diabetes/diabetes.csv">仓库文件</a>。</p>
<h4 id="rainbow-加载数据集"><a href="#rainbow-加载数据集" class="headerlink" title=":rainbow:加载数据集"></a>:rainbow:加载数据集</h4><p>使用Pandas加载数据：</p>
<pre class="line-numbers language-python"><code class="language-python"><span class="token keyword">import</span> pandas <span class="token keyword">as</span> pd
data <span class="token operator">=</span> pd<span class="token punctuation">.</span>read_csv<span class="token punctuation">(</span><span class="token string">'./pima-indians-diabetes/diabetes.csv'</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'dataset shape {}'</span><span class="token punctuation">.</span>format<span class="token punctuation">(</span>data<span class="token punctuation">.</span>shape<span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>data<span class="token punctuation">.</span>head<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span></span></code></pre>
<p>输出如下：</p>
<pre class="line-numbers language-python"><code class="language-python">dataset shape <span class="token punctuation">(</span><span class="token number">768</span><span class="token punctuation">,</span> <span class="token number">9</span><span class="token punctuation">)</span>
Out<span class="token punctuation">[</span><span class="token number">23</span><span class="token punctuation">]</span><span class="token punctuation">:</span>
Pregnancies Glucose BloodPressure   SkinThickness   Insulin BMI DiabetesPedigreeFunction    Age Outcome
<span class="token number">0</span>   <span class="token number">6</span>   <span class="token number">148</span> <span class="token number">72</span>  <span class="token number">35</span>  <span class="token number">0</span>   <span class="token number">33.6</span>    <span class="token number">0.627</span>   <span class="token number">50</span>  <span class="token number">1</span>
<span class="token number">1</span>   <span class="token number">1</span>   <span class="token number">85</span>  <span class="token number">66</span>  <span class="token number">29</span>  <span class="token number">0</span>   <span class="token number">26.6</span>    <span class="token number">0.351</span>   <span class="token number">31</span>  <span class="token number">0</span>
<span class="token number">2</span>   <span class="token number">8</span>   <span class="token number">183</span> <span class="token number">64</span>  <span class="token number">0</span>   <span class="token number">0</span>   <span class="token number">23.3</span>    <span class="token number">0.672</span>   <span class="token number">32</span>  <span class="token number">1</span>
<span class="token number">3</span>   <span class="token number">1</span>   <span class="token number">89</span>  <span class="token number">66</span>  <span class="token number">23</span>  <span class="token number">94</span>  <span class="token number">28.1</span>    <span class="token number">0.167</span>   <span class="token number">21</span>  <span class="token number">0</span>
<span class="token number">4</span>   <span class="token number">0</span>   <span class="token number">137</span> <span class="token number">40</span>  <span class="token number">35</span>  <span class="token number">168</span> <span class="token number">43.1</span>    <span class="token number">2.288</span>   <span class="token number">33</span>  <span class="token number">1</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span></span></code></pre>
<p>从打印出的信息可以看到，这个数据集一共有 768 个样本、8 个特征、1 个标签：</p>
<p><code>Pregnancies</code>：怀孕的次数</p>
<p><code>Glucose</code>：血浆葡萄糖浓度，采用 2 小时口服葡萄糖耐量试验测得</p>
<p><code>BloodPressure</code>：舒张压（毫米汞柱）</p>
<p><code>SkinThickness</code>：肱三头肌皮肤褶皱厚度（毫米）</p>
<p><code>Insulin</code>：两个小时血清胰岛素（ μU /毫升）</p>
<p><code>BMI</code>：身体质量指数，体重除以身高的平方</p>
<p><code>DiabetesPedigreeFunction</code>：糖尿病血统指数，糖尿病和家庭遗传相关</p>
<p><code>Age</code>：年龄</p>
<p><code>Outcome</code>：0表示没有糖尿病，1表示有糖尿病</p>
<p> 我们可以进一步观察数据集里的阳性和阴性样本的个数：</p>
<pre class="line-numbers language-python"><code class="language-python">data<span class="token punctuation">.</span>groupby<span class="token punctuation">(</span><span class="token string">'Outcome'</span><span class="token punctuation">)</span><span class="token punctuation">.</span>size<span class="token punctuation">(</span><span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span></span></code></pre>
<p>输出为：</p>
<pre class="line-numbers language-python"><code class="language-python">Outcome
<span class="token number">0</span>    <span class="token number">500</span>
<span class="token number">1</span>    <span class="token number">268</span>
dtype<span class="token punctuation">:</span> int64<span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span></span></code></pre>
<p>其中，阴性样本500例，阳性样本268例。</p>
<h4 id="rainbow-处理数据集"><a href="#rainbow-处理数据集" class="headerlink" title=":rainbow: 处理数据集"></a>:rainbow: 处理数据集</h4><p> 接着需要对数据集进行简单处理，把8个特征值分离出来，作为训练数据集，把Outcome列分离出来作为目标值。然后，把数据集划分为训练数据集和测试数据集。</p>
<pre class="line-numbers language-python"><code class="language-python">X <span class="token operator">=</span> data<span class="token punctuation">.</span>iloc<span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token punctuation">,</span><span class="token punctuation">:</span><span class="token number">8</span><span class="token punctuation">]</span>
Y <span class="token operator">=</span> data<span class="token punctuation">.</span>iloc<span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token punctuation">,</span><span class="token number">8</span><span class="token punctuation">]</span>
<span class="token keyword">from</span> sklearn<span class="token punctuation">.</span>model_selection <span class="token keyword">import</span> train_test_split
X_train<span class="token punctuation">,</span>X_test<span class="token punctuation">,</span>Y_train<span class="token punctuation">,</span>Y_test<span class="token operator">=</span>train_test_split<span class="token punctuation">(</span>X<span class="token punctuation">,</span>Y<span class="token punctuation">,</span>test_size<span class="token operator">=</span><span class="token number">0.2</span><span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>X_train<span class="token punctuation">.</span>shape<span class="token punctuation">,</span> X_test<span class="token punctuation">.</span>shape<span class="token punctuation">,</span> Y_train<span class="token punctuation">.</span>shape<span class="token punctuation">,</span> Y_test<span class="token punctuation">.</span>shape<span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span><span></span></span></code></pre>
<p>输出：</p>
<pre class="line-numbers language-python"><code class="language-python"><span class="token punctuation">(</span><span class="token number">614</span><span class="token punctuation">,</span> <span class="token number">8</span><span class="token punctuation">)</span> <span class="token punctuation">(</span><span class="token number">154</span><span class="token punctuation">,</span> <span class="token number">8</span><span class="token punctuation">)</span> <span class="token punctuation">(</span><span class="token number">614</span><span class="token punctuation">,</span><span class="token punctuation">)</span> <span class="token punctuation">(</span><span class="token number">154</span><span class="token punctuation">,</span><span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span></span></code></pre>
<h4 id="rainbow-模型比较"><a href="#rainbow-模型比较" class="headerlink" title=":rainbow: 模型比较"></a>:rainbow: 模型比较</h4><p>分别使用普通的KNN算法、带权重的KNN算法和指定半径的KNN算法对数据集进行拟合并计算评分：</p>
<pre class="line-numbers language-python"><code class="language-python"><span class="token keyword">from</span> sklearn<span class="token punctuation">.</span>neighbors <span class="token keyword">import</span> KNeighborsClassifier<span class="token punctuation">,</span> RadiusNeighborsClassifier

models <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>
models<span class="token punctuation">.</span>append<span class="token punctuation">(</span><span class="token punctuation">(</span><span class="token string">"KNN"</span><span class="token punctuation">,</span> KNeighborsClassifier<span class="token punctuation">(</span>n_neighbors<span class="token operator">=</span><span class="token number">2</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
models<span class="token punctuation">.</span>append<span class="token punctuation">(</span><span class="token punctuation">(</span><span class="token string">"KNN with weights"</span><span class="token punctuation">,</span> KNeighborsClassifier<span class="token punctuation">(</span>
    n_neighbors<span class="token operator">=</span><span class="token number">2</span><span class="token punctuation">,</span> weights<span class="token operator">=</span><span class="token string">"distance"</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
models<span class="token punctuation">.</span>append<span class="token punctuation">(</span><span class="token punctuation">(</span><span class="token string">"Radius Neighbors"</span><span class="token punctuation">,</span> RadiusNeighborsClassifier<span class="token punctuation">(</span>
    n_neighbors<span class="token operator">=</span><span class="token number">2</span><span class="token punctuation">,</span> radius<span class="token operator">=</span><span class="token number">500.0</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span>

results <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>
<span class="token keyword">for</span> name<span class="token punctuation">,</span> model <span class="token keyword">in</span> models<span class="token punctuation">:</span>
    model<span class="token punctuation">.</span>fit<span class="token punctuation">(</span>X_train<span class="token punctuation">,</span> Y_train<span class="token punctuation">)</span>
    results<span class="token punctuation">.</span>append<span class="token punctuation">(</span><span class="token punctuation">(</span>name<span class="token punctuation">,</span> model<span class="token punctuation">.</span>score<span class="token punctuation">(</span>X_test<span class="token punctuation">,</span> Y_test<span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token keyword">for</span> i <span class="token keyword">in</span> range<span class="token punctuation">(</span>len<span class="token punctuation">(</span>results<span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">:</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">"name: {}; score: {}"</span><span class="token punctuation">.</span>format<span class="token punctuation">(</span>results<span class="token punctuation">[</span>i<span class="token punctuation">]</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">,</span>results<span class="token punctuation">[</span>i<span class="token punctuation">]</span><span class="token punctuation">[</span><span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span></span></code></pre>
<p>三种算法的性能如下：</p>
<pre class="line-numbers language-python"><code class="language-python">name<span class="token punctuation">:</span> KNN<span class="token punctuation">;</span> score<span class="token punctuation">:</span> <span class="token number">0.7467532467532467</span>
name<span class="token punctuation">:</span> KNN <span class="token keyword">with</span> weights<span class="token punctuation">;</span> score<span class="token punctuation">:</span> <span class="token number">0.6818181818181818</span>
name<span class="token punctuation">:</span> Radius Neighbors<span class="token punctuation">;</span> score<span class="token punctuation">:</span> <span class="token number">0.6558441558441559</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span></span></code></pre>
<p>带权重的KNN算法，我们选择了距离越近、权重越高。指定半径的KNN算法的半径选择了500。从上面的输出结果可以看出，普通的KNN算法性能最好。问题来了，这个判断准确么？答案是不准确。因为我们的训练样本和测试样本是随机分配的，不同的训练样本和测试样本组合可能导致计算出来的算法准确性是有差异的。我们可以试着多次运行上面的代码，观察输出值是否有变化。</p>
<p>怎么样更准确地对比算法准确性呢？一个方法是，多次随机分配训练数据集和交叉验证数据集，然后求模型准确性评分的平均值。所幸，我们不需要从头实现这个过程，<code>scikit-learn</code> 提供了 <code>KFold</code> 和 <code>cross_val_score()</code>函数来处理这种问题：</p>
<pre class="line-numbers language-python"><code class="language-python"><span class="token keyword">from</span> sklearn<span class="token punctuation">.</span>model_selection <span class="token keyword">import</span> KFold
<span class="token keyword">from</span> sklearn<span class="token punctuation">.</span>model_selection <span class="token keyword">import</span> cross_val_score
results <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>
<span class="token keyword">for</span> name<span class="token punctuation">,</span> model <span class="token keyword">in</span> models<span class="token punctuation">:</span>
    kfold <span class="token operator">=</span> KFold<span class="token punctuation">(</span>n_splits<span class="token operator">=</span><span class="token number">10</span><span class="token punctuation">)</span>
    cv_result <span class="token operator">=</span> cross_val_score<span class="token punctuation">(</span>model<span class="token punctuation">,</span> X<span class="token punctuation">,</span> Y<span class="token punctuation">,</span> cv<span class="token operator">=</span>kfold<span class="token punctuation">)</span>
    results<span class="token punctuation">.</span>append<span class="token punctuation">(</span><span class="token punctuation">(</span>name<span class="token punctuation">,</span> cv_result<span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token keyword">for</span> i <span class="token keyword">in</span> range<span class="token punctuation">(</span>len<span class="token punctuation">(</span>results<span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">:</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">"name: {}; cross val score: {}"</span><span class="token punctuation">.</span>format<span class="token punctuation">(</span>
        results<span class="token punctuation">[</span>i<span class="token punctuation">]</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">,</span>results<span class="token punctuation">[</span>i<span class="token punctuation">]</span><span class="token punctuation">[</span><span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">.</span>mean<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span></span></code></pre>
<p>上述代码中，我们通过KFold把数据集分成10份，其中1份会作为交叉验证数据集来计算模型准确性，剩余的9份作为训练数据集。cross_val_score()函数总共计算出10次不同训练数据集和交叉验证数据集组合得到的模型准确性评分，最后求平均值。这样的评价结果相对更准确一些。<br> 输出结果为：</p>
<pre class="line-numbers language-python"><code class="language-python">name<span class="token punctuation">:</span> KNN<span class="token punctuation">;</span> cross val score<span class="token punctuation">:</span> <span class="token number">0.7147641831852358</span>
name<span class="token punctuation">:</span> KNN <span class="token keyword">with</span> weights<span class="token punctuation">;</span> cross val score<span class="token punctuation">:</span> <span class="token number">0.6770505809979495</span>
name<span class="token punctuation">:</span> Radius Neighbors<span class="token punctuation">;</span> cross val score<span class="token punctuation">:</span> <span class="token number">0.6497265892002735</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span></span></code></pre>
<h4 id="rainbow-模型训练及分析"><a href="#rainbow-模型训练及分析" class="headerlink" title=":rainbow:模型训练及分析"></a>:rainbow:模型训练及分析</h4><p>通过上面的对比来看，普通的KNN算法性能更优一些。接下来，我们就使用普通的KNN算法模型对数据集进行训练，并查看对训练样本的拟合情况以及对测试样本的预测准确性情况：</p>
<pre class="line-numbers language-python"><code class="language-python">knn <span class="token operator">=</span> KNeighborsClassifier<span class="token punctuation">(</span>n_neighbors<span class="token operator">=</span><span class="token number">2</span><span class="token punctuation">)</span>
knn<span class="token punctuation">.</span>fit<span class="token punctuation">(</span>X_train<span class="token punctuation">,</span> Y_train<span class="token punctuation">)</span>
train_score <span class="token operator">=</span> knn<span class="token punctuation">.</span>score<span class="token punctuation">(</span>X_train<span class="token punctuation">,</span> Y_train<span class="token punctuation">)</span>
test_score <span class="token operator">=</span> knn<span class="token punctuation">.</span>score<span class="token punctuation">(</span>X_test<span class="token punctuation">,</span> Y_test<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">"train score: {}\ntest score: {}"</span><span class="token punctuation">.</span>format<span class="token punctuation">(</span>train_score<span class="token punctuation">,</span> test_score<span class="token punctuation">)</span><span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span><span></span></span></code></pre>
<p>输出结果为：</p>
<pre class="line-numbers language-python"><code class="language-python">train score<span class="token punctuation">:</span> <span class="token number">0.8387622149837134</span>
test score<span class="token punctuation">:</span> <span class="token number">0.7337662337662337</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span></span></code></pre>
<p>从输出中可以看到两个问题。一是对训练样本的拟合情况不佳，评分才0.83多，这说明算法模型太简单了，无法很好地拟合训练样本。二是模型的准确性欠佳，不到74%的预测准确性。我们可以进一步画出学习曲线，证实结论。</p>
<pre class="line-numbers language-python"><code class="language-python"><span class="token keyword">from</span> sklearn<span class="token punctuation">.</span>model_selection <span class="token keyword">import</span> ShuffleSplit
<span class="token keyword">from</span> common<span class="token punctuation">.</span>utils <span class="token keyword">import</span> plot_learning_curve
knn <span class="token operator">=</span> KNeighborsClassifier<span class="token punctuation">(</span>n_neighbors<span class="token operator">=</span><span class="token number">2</span><span class="token punctuation">)</span>
cv <span class="token operator">=</span> ShuffleSplit<span class="token punctuation">(</span>n_splits<span class="token operator">=</span><span class="token number">10</span><span class="token punctuation">,</span> test_size<span class="token operator">=</span><span class="token number">0.2</span><span class="token punctuation">,</span> random_state<span class="token operator">=</span><span class="token number">0</span><span class="token punctuation">)</span>
plt<span class="token punctuation">.</span>figure<span class="token punctuation">(</span>figsize<span class="token operator">=</span><span class="token punctuation">(</span><span class="token number">10</span><span class="token punctuation">,</span> <span class="token number">6</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
plot_learning_curve<span class="token punctuation">(</span>plt<span class="token punctuation">,</span> knn<span class="token punctuation">,</span> <span class="token string">"Learn Curve for KNN Diabetes"</span><span class="token punctuation">,</span> 
                    X<span class="token punctuation">,</span> Y<span class="token punctuation">,</span> ylim<span class="token operator">=</span><span class="token punctuation">(</span><span class="token number">0.0</span><span class="token punctuation">,</span> <span class="token number">1.01</span><span class="token punctuation">)</span><span class="token punctuation">,</span> cv<span class="token operator">=</span>cv<span class="token punctuation">)</span>
plt<span class="token punctuation">.</span>show<span class="token punctuation">(</span><span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span></span></code></pre>
<img src="/medias/loading.gif" data-original="https://cdn.jsdelivr.net/gh/dongzhougu/imageuse1/image-20200629101150911.png" alt="KNN 糖尿病预测模型的学习曲线" style="zoom:67%;">

<p>从图中可以看出来，训练样本评分较低，且测试样本与训练样本距离较大，这是典型的欠拟合现象。KNN算法没有更好的措施来解决欠拟合问题，我们学完本书的其他章节后，可以试着用其他算法（如逻辑回归算法、支持向量机等）来对比不同模型的准确性情况。</p>
<h4 id="rainbow-特征选择及数据可视化"><a href="#rainbow-特征选择及数据可视化" class="headerlink" title=":rainbow:特征选择及数据可视化"></a>:rainbow:特征选择及数据可视化</h4><p>那有没有直观的方法，来揭示出为什么KNN算法不是针对这一问题的好模型？一个办法是把数据画出来，可是我们有8个特征，无法在这么高的维度里画出数据，并直观地观察。一个解决办法是特征选择，即只选择2个与输出值相关性最大的特征，这样就可以在二维平面上画出输入特征值与输出值的关系了。</p>
<p><code>scikit-learn</code>在 <code>sklearn.feature_selection</code> 包里提供了丰富的特征选择方法。我们使用 <code>SelectKBest</code> 来选择相关性最大的两个特征：</p>
<pre class="line-numbers language-python"><code class="language-python"><span class="token keyword">from</span> sklearn<span class="token punctuation">.</span>feature_selection <span class="token keyword">import</span> SelectKBest
selector <span class="token operator">=</span> SelectKBest<span class="token punctuation">(</span>k<span class="token operator">=</span><span class="token number">2</span><span class="token punctuation">)</span>
X_new <span class="token operator">=</span> selector<span class="token punctuation">.</span>fit_transform<span class="token punctuation">(</span>X<span class="token punctuation">,</span> Y<span class="token punctuation">)</span>
<span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">'X_new.shape {}'</span><span class="token punctuation">.</span>format<span class="token punctuation">(</span>X_new<span class="token punctuation">.</span>shape<span class="token punctuation">)</span><span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span></span></code></pre>
<p>把相关性最大的两个特征放在X_new变量里，输出结果为：</p>
<pre class="line-numbers language-python"><code class="language-python">X_new<span class="token punctuation">.</span>shape <span class="token punctuation">(</span><span class="token number">768</span><span class="token punctuation">,</span> <span class="token number">2</span><span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span></span></code></pre>
<p>我们可能会好奇，相关性最大的特征到底是哪两个？对比一下本节开头的数据即可知道，它们分别是Glucose（血糖浓度）和BMI（身体质量指数）。血糖浓度和糖尿病的关系自不必说，身体质量指数是反映肥胖程度的指标，从业务角度来看，我们选择出来的2个相关性最高的特征还算合理。那么 <code>SelectKBest</code> 到底使用什么神奇的方法选择出了这两个相关性最高的特征呢？详情参考下一节。</p>
<p>我们来看看，如果只使用这2个相关性最高的特征的话，3种不同的KNN算法哪个准确性更高：</p>
<pre class="line-numbers language-python"><code class="language-python">results <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>
<span class="token keyword">for</span> name<span class="token punctuation">,</span> model <span class="token keyword">in</span> models<span class="token punctuation">:</span>
    kfold <span class="token operator">=</span> KFold<span class="token punctuation">(</span>n_splits<span class="token operator">=</span><span class="token number">10</span><span class="token punctuation">)</span>
    cv_result <span class="token operator">=</span> cross_val_score<span class="token punctuation">(</span>model<span class="token punctuation">,</span> X_new<span class="token punctuation">,</span> Y<span class="token punctuation">,</span> cv<span class="token operator">=</span>kfold<span class="token punctuation">)</span>
    results<span class="token punctuation">.</span>append<span class="token punctuation">(</span><span class="token punctuation">(</span>name<span class="token punctuation">,</span> cv_result<span class="token punctuation">)</span><span class="token punctuation">)</span>
<span class="token keyword">for</span> i <span class="token keyword">in</span> range<span class="token punctuation">(</span>len<span class="token punctuation">(</span>results<span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">:</span>
    <span class="token keyword">print</span><span class="token punctuation">(</span><span class="token string">"name: {}; cross val score: {}"</span><span class="token punctuation">.</span>format<span class="token punctuation">(</span>
        results<span class="token punctuation">[</span>i<span class="token punctuation">]</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">,</span>results<span class="token punctuation">[</span>i<span class="token punctuation">]</span><span class="token punctuation">[</span><span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">.</span>mean<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span></span></code></pre>
<p>这次使用X_new作为输入，输出如下</p>
<pre class="line-numbers language-python"><code class="language-python">name<span class="token punctuation">:</span> KNN<span class="token punctuation">;</span> cross val score<span class="token punctuation">:</span> <span class="token number">0.725205058099795</span>
name<span class="token punctuation">:</span> KNN <span class="token keyword">with</span> weights<span class="token punctuation">;</span> cross val score<span class="token punctuation">:</span> <span class="token number">0.6900375939849623</span>
name<span class="token punctuation">:</span> Radius Neighbors<span class="token punctuation">;</span> cross val score<span class="token punctuation">:</span> <span class="token number">0.6510252904989747</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span></span></code></pre>
<p>从输出可以看出来，还是普通的KNN模型准确性较高，其准确性也达到了将近 73 %，与所有特征拿来一块儿训练的准确性差不多。这也侧面证明了 <code>SelectKBest</code> 特征选择的有效性。</p>
<p>回到目标上来，我们是想看看为什么KNN无法很好地拟合训练样本。现在我们只有 2 个特征，可以很方便地在二维坐标上画出所有的训练样本，观察这些数据的分布情况：</p>
<pre class="line-numbers language-python"><code class="language-python">plt<span class="token punctuation">.</span>figure<span class="token punctuation">(</span>figsize<span class="token operator">=</span><span class="token punctuation">(</span><span class="token number">10</span><span class="token punctuation">,</span> <span class="token number">6</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
plt<span class="token punctuation">.</span>ylabel<span class="token punctuation">(</span><span class="token string">"BMI"</span><span class="token punctuation">)</span>
plt<span class="token punctuation">.</span>xlabel<span class="token punctuation">(</span><span class="token string">"Glucose"</span><span class="token punctuation">)</span>
plt<span class="token punctuation">.</span>scatter<span class="token punctuation">(</span>X_new<span class="token punctuation">[</span>Y<span class="token operator">==</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token punctuation">,</span> <span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">,</span> X_new<span class="token punctuation">[</span>Y<span class="token operator">==</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token punctuation">,</span> <span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">,</span> c<span class="token operator">=</span><span class="token string">'r'</span><span class="token punctuation">,</span> s<span class="token operator">=</span><span class="token number">20</span><span class="token punctuation">,</span> marker<span class="token operator">=</span><span class="token string">'o'</span><span class="token punctuation">)</span><span class="token punctuation">;</span>   <span class="token comment" spellcheck="true">#画出样本</span>
plt<span class="token punctuation">.</span>scatter<span class="token punctuation">(</span>X_new<span class="token punctuation">[</span>Y<span class="token operator">==</span><span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token punctuation">,</span> <span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">,</span> X_new<span class="token punctuation">[</span>Y<span class="token operator">==</span><span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token punctuation">,</span> <span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">,</span> c<span class="token operator">=</span><span class="token string">'g'</span><span class="token punctuation">,</span> s<span class="token operator">=</span><span class="token number">20</span><span class="token punctuation">,</span> marker<span class="token operator">=</span><span class="token string">'^'</span><span class="token punctuation">)</span><span class="token punctuation">;</span>   <span class="token comment" spellcheck="true">#画出样本</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span><span></span></span></code></pre>
<img src="/medias/loading.gif" data-original="https://cdn.jsdelivr.net/gh/dongzhougu/imageuse1/image-20200629102028938.png" alt="以 Glucose 为横坐标、BMI 为纵坐标的样本散点图" style="zoom:67%;">

<p>横坐标是血糖值 Glucose，纵坐标是BMI值，反映身体肥胖情况。从图中可以看出，在中间数据集密集的区域，阳性样本和阴性样本几乎重叠在一起了。假设现在有一个待预测的样本在中间密集区域，它的阳性邻居多还是阴性邻居多呢？这真的很难说。这样就可以直观地看到，KNN算法在这个糖尿病预测问题上，无法达到很高的预测准确性。</p>
<h3 id="拓展阅读"><a href="#拓展阅读" class="headerlink" title="拓展阅读"></a>拓展阅读</h3><p>这里再继续介绍一下特征选择时，计算相关性大小的 <code>SelectKBest()</code> 函数背后的统计学知识。</p>
<h4 id="如何提高KNN算法的运算效率"><a href="#如何提高KNN算法的运算效率" class="headerlink" title="如何提高KNN算法的运算效率"></a>如何提高KNN算法的运算效率</h4><p>根据算法原理，每次需要预测一个点时，我们都需要计算训练数据集里每个点到这个点的距离，然后选出距离最近的k个点进行投票。当数据集很大时，这个计算成本非常高。针对$N$个样本，$D$个特征的数据集，其算法复杂度为$O(DN^2)$。</p>
<p>为了解决这个问题，一种叫<code>K-D Tree</code> 的数据结构被发明出来。为了避免每次都重新计算一遍距离，算法会把距离信息保存在一棵树里，这样在计算之前从树里查询距离信息，尽量避免重新计算。其基本原理是，如果A和B距离很远，B和C距离很近，那么A和C的距离也很远。有了这个信息，就可以在合适的时候跳过距离远的点。这样优化后的算法复杂度可降低到$O(DNlog(N))$。感兴趣的读者可参阅论文：Bentley, J.L., Communications of the ACM (1975)。</p>
<p>1989年，另外一种称为<code>Ball Tree</code>的算法，在<code>K-D Tree</code>的基础上对性能进一步进行了优化。感兴趣的读者可以搜索Five balltree construction algorithms来了解详细的算法信息。</p>
<h4 id="相关性测试"><a href="#相关性测试" class="headerlink" title="相关性测试"></a>相关性测试</h4><p>先通过一个简单的例子来看假设检验问题，即判断假设的结论是否成立或成立的概率有多高。假设，在一个城市随机采样到程序员和性别的关系的数据：</p>
<img src="/medias/loading.gif" data-original="https://cdn.jsdelivr.net/gh/dongzhougu/imageuse1/17634123-0c82f9f4005936fa.png" alt="程序员职业与性别关系的随机采样观测数据表" style="zoom:50%;">

<p>假设，我们的结论是程序员和性别无关，这个假设称为原假设（null hypothesis）。问：通过我们随机采样观测到的数据，原假设是否成立，或者说原假设成立的概率有多高？</p>
<p><code>卡方检验（chi-squared test）</code>是检测假设成立与否的一个常用的工具。它的计算公式是：</p>
<img src="/medias/loading.gif" data-original="https://cdn.jsdelivr.net/gh/dongzhougu/imageuse1/17634123-3a52b648ce9e2196.png" alt="卡方检验计算公式：χ² = Σ (O − E)² / E" style="zoom:50%;">

<p>其中，卡方检验的值记为 $\chi^2$，$O$ 是观测值，$E$ 是期望值。针对我们的例子，如果原假设成立，即程序员职业和性别无关，那么我们期望的男程序员数量应该为(14/489) * 242=6.928，女程序员数量应该为(14/489) * 247=7.072，同理可得到我们的期望值如下：</p>
<img src="/medias/loading.gif" data-original="https://cdn.jsdelivr.net/gh/dongzhougu/imageuse1/17634123-9f2c88dc6a7eeb66.png" alt="原假设成立时程序员职业与性别的期望值表" style="zoom:50%;">


<p> 根据卡方检验的公式，可以算出卡方值为：</p>
<img src="/medias/loading.gif" data-original="https://cdn.jsdelivr.net/gh/dongzhougu/imageuse1/17634123-bf0dbd2fd6c8db0c.png" alt="根据观测值和期望值计算卡方值的过程" style="zoom:50%;">

<p> 算出卡方值后，怎么判断原假设成立的概率是多少呢？这里还涉及到自由度和卡方分布的概念。简单地讲，自由度是$(r-1)\times(c-1)$，其中 r 是行数，c 是列数，针对我们的问题，其自由度为1。卡方分布是指，若n个相互独立的随机变量均服从标准正态分布，则这 n 个随机变量的平方和构成一新的随机变量，其分布规律称为卡方分布。卡方分布的密度函数和自由度相关，知道了自由度和目标概率，我们就能求出卡方值。</p>
<p>针对我们的问题，可以查表得到，自由度为1的卡方分布，在99%处的卡方值为6.63。我们计算出来的卡方值为7.670。由于7.67&gt;6.63，故有99%的把握可以推翻原假设。换个说法，如果原假设成立，即程序员职业和性别无关，那么我们随机采样到的数据出现的概率将低于1%。我们可以搜索<code>“卡方表”</code>或<code>“Chi Squared Table”</code>找到不同自由度对应的卡方值。</p>
<p>卡方值的大小可以反映变量与目标值的相关性，值越大，相关性越大。利用这一特性，<code>SelectKBest()</code> 函数就可以计算不同特征的卡方值来判断特征与输出值的相关性大小，从而完成特征选择。在<code>scikit-learn</code>里，计算卡方值的函数是 <code>sklearn.feature_selection.chi2()</code>。除了卡方检验外，还有<code>F值检验</code>等算法，也可以用来评估特征与目标值的相关性。<code>SelectKBest</code> 默认使用的就是F值检验算法，在<code>scikit-learn</code>里，使用<code>sklearn.feature_selection.f_classif</code>来计算F值。关于F值相关的资料，感兴趣的读者可以在英文版维基百科上搜索<code>“F-test”</code>，了解更多信息。</p>
<script>
        document.querySelectorAll('.github-emoji')
          .forEach(el => {
            if (!el.dataset.src) { return; }
            const img = document.createElement('img');
            img.style = 'display:none !important;';
            img.src = el.dataset.src;
            img.addEventListener('error', () => {
              img.remove();
              el.style.color = 'inherit';
              el.style.backgroundImage = 'none';
              el.style.background = 'none';
            });
            img.addEventListener('load', () => {
              img.remove();
            });
            document.body.appendChild(img);
          });
      </script>
            </div>
            <hr/>

            

    <div class="reprint" id="reprint-statement">
        
            <div class="reprint__author">
                <span class="reprint-meta" style="font-weight: bold;">
                    <i class="fas fa-user">
                        文章作者:
                    </i>
                </span>
                <span class="reprint-info">
                    <a href="/about" rel="external nofollow noreferrer">DongZhou</a>
                </span>
            </div>
            <div class="reprint__type">
                <span class="reprint-meta" style="font-weight: bold;">
                    <i class="fas fa-link">
                        文章链接:
                    </i>
                </span>
                <span class="reprint-info">
                    <a href="https://dongzhougu.github.io/2020/06/29/scikit-learn-xi-lie-er-k-jin-lin/">https://dongzhougu.github.io/2020/06/29/scikit-learn-xi-lie-er-k-jin-lin/</a>
                </span>
            </div>
            <div class="reprint__notice">
                <span class="reprint-meta" style="font-weight: bold;">
                    <i class="fas fa-copyright">
                        版权声明:
                    </i>
                </span>
                <span class="reprint-info">
                    本博客所有文章除特别声明外，均采用
                    <a href="https://creativecommons.org/licenses/by/4.0/deed.zh" rel="external nofollow noreferrer" target="_blank">CC BY 4.0</a>
                    许可协议。转载请注明来源
                    <a href="/about" target="_blank">DongZhou</a>
                    !
                </span>
            </div>
        
    </div>

    <script async defer>
      document.addEventListener("copy", function (e) {
        let toastHTML = '<span>复制成功，请遵循本文的转载规则</span><button class="btn-flat toast-action" onclick="navToReprintStatement()" style="font-size: smaller">查看</a>';
        M.toast({html: toastHTML})
      });

      function navToReprintStatement() {
        $("html, body").animate({scrollTop: $("#reprint-statement").offset().top - 80}, 800);
      }
    </script>



            <div class="tag_share" style="display: block;">
                <div class="post-meta__tag-list" style="display: inline-block;">
                    
                        <div class="article-tag">
                            
                                <a href="/tags/%E5%9F%BA%E7%A1%80%E7%9F%A5%E8%AF%86/">
                                    <span class="chip bg-color">基础知识</span>
                                </a>
                            
                                <a href="/tags/ML%E7%AE%97%E6%B3%95/">
                                    <span class="chip bg-color">ML算法</span>
                                </a>
                            
                        </div>
                    
                </div>
                <div class="post_share" style="zoom: 80%; width: fit-content; display: inline-block; float: right; margin: -0.15rem 0;">
                    <link rel="stylesheet" type="text/css" href="/libs/share/css/share.min.css">
<div id="article-share">

    
    <div class="social-share" data-sites="google,qq,qzone,wechat,weibo,douban,linkedin" data-wechat-qrcode-helper="<p>微信扫一扫即可分享！</p>"></div>
    <script src="/libs/share/js/social-share.min.js"></script>
    

    

</div>

                </div>
            </div>
            
                <style>
    /* ---- Reward trigger area ---- */
    #reward {
        text-align: center;
        margin: 40px 0;
    }

    #reward .reward-link {
        line-height: 38px;
        font-size: 1.4rem;
    }

    #reward .btn-floating:hover {
        box-shadow: 0 6px 12px rgba(0, 0, 0, 0.2), 0 5px 15px rgba(0, 0, 0, 0.2);
    }

    /* ---- Reward modal shell ---- */
    #rewardModal {
        height: 350px;
        width: 320px;
    }

    #rewardModal .reward-title {
        padding-bottom: 5px;
        margin: 15px auto;
    }

    #rewardModal .modal-content {
        padding: 10px;
    }

    /* Close icon pinned to the modal's top-right corner. */
    #rewardModal .close {
        position: absolute;
        top: 15px;
        right: 15px;
        cursor: pointer;
        color: rgba(0, 0, 0, 0.5);
        font-size: 1.3rem;
        line-height: 20px;
    }

    /* Hover: turn red and grow; prefixed transforms are kept as in the original. */
    #rewardModal .close:hover {
        color: #ef5350;
        transform: scale(1.3);
        -moz-transform:scale(1.3);
        -webkit-transform:scale(1.3);
        -o-transform:scale(1.3);
    }

    /* ---- Payment tabs ---- */
    #rewardModal .reward-tabs {
        width: 210px;
        margin: 0 auto;
    }

    .reward-tabs .tabs {
        height: 38px;
        padding-left: 0;
        margin: 10px auto;
    }

    .reward-content ul {
        padding-left: 0 !important;
    }

    .reward-tabs .tabs .tab {
        line-height: 38px;
        height: 38px;
    }

    /* Inactive tab links look the same whether hovered or not. */
    .reward-tabs .tab a,
    .reward-tabs .tab a:hover {
        color: #fff;
        background-color: #ccc;
    }

    /* Active tab colors, one per payment provider tab. */
    .reward-tabs .wechat-tab .active {
        background-color: #22AB38 !important;
        color: #fff !important;
    }

    .reward-tabs .alipay-tab .active {
        background-color: #019FE8 !important;
        color: #fff !important;
    }

    /* QR code images are shown as fixed 210px squares. */
    .reward-tabs .reward-img {
        height: 210px;
        width: 210px;
    }
</style>

<div id="reward">
    <a href="#rewardModal" class="reward-link modal-trigger btn-floating btn-medium waves-effect waves-light red">赏</a>

    <!-- Modal Structure -->
    <div id="rewardModal" class="modal">
        <div class="modal-content">
            <a class="close modal-close"><i class="fas fa-times"></i></a>
            <h4 class="reward-title">你的赏识是我前进的动力</h4>
            <div class="reward-content">
                <div class="reward-tabs">
                    <ul class="tabs row">
                        <li class="tab col s6 alipay-tab waves-effect waves-light"><a href="#alipay">支付宝</a></li>
                        <li class="tab col s6 wechat-tab waves-effect waves-light"><a href="#wechat">微 信</a></li>
                    </ul>
                    <div id="alipay">
                        <img src="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/medias/reward/alipay.jpg" class="reward-img" alt="支付宝打赏二维码">
                    </div>
                    <div id="wechat">
                        <img src="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/medias/reward/wechat.png" class="reward-img" alt="微信打赏二维码">
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>

<script>
    $(function () {
        $('.tabs').tabs();
    });
</script>

            
        </div>
    </div>

    

    

    

    

    
        <style>
    /*
     * Styles for the Valine comment card (#vcomments).
     *
     * Rules that appeared more than once with the same selector have been
     * collapsed to the declarations that win the cascade (the last copy).
     *
     * NOTE(review): several selector lists below contain an unscoped part
     * (e.g. "ul", "ol li", "th, td", "strong", "thead") that is NOT limited to
     * #vcomments and therefore matches elements across the whole page. They are
     * kept as-is to preserve current rendering; confirm whether this is intended.
     */

    /* ---- Card container ---- */
    .valine-card {
        margin: 1.5rem auto;
    }

    .valine-card .card-content {
        padding: 20px 20px 5px 20px;
    }

    /* ---- Comment input ---- */
    #vcomments textarea {
        box-sizing: border-box;
        background: url("/medias/comment_bg.png") 100% 100% no-repeat;
    }

    /* ---- Paragraphs, links, images ---- */
    #vcomments p {
        margin: 2px 2px 10px;
        font-size: 1rem;
        line-height: 1.5rem;
    }

    #vcomments blockquote p {
        text-indent: 0.2rem;
    }

    #vcomments a {
        padding: 0 2px;
        color: #4cbf30;
        font-weight: 500;
        text-decoration: none;
    }

    #vcomments img {
        max-width: 100%;
        height: auto;
        cursor: pointer;
    }

    /* ---- Lists ---- */
    #vcomments ol li {
        list-style-type: decimal;
    }

    /* NOTE(review): "ul" here is page-wide, not scoped to #vcomments. */
    #vcomments ol,
    ul {
        display: block;
        padding-left: 2em;
        word-spacing: 0.05rem;
    }

    /* NOTE(review): "ol li" here is page-wide, not scoped to #vcomments. */
    #vcomments ul li,
    ol li {
        display: list-item;
        line-height: 1.8rem;
        font-size: 1rem;
    }

    #vcomments ul li {
        list-style-type: disc;
    }

    #vcomments ul ul li {
        list-style-type: circle;
    }

    /* ---- Headings ---- */
    #vcomments h1 {
        font-size: 1.85rem;
        font-weight: bold;
        line-height: 2.2rem;
    }

    #vcomments h2 {
        font-size: 1.65rem;
        font-weight: bold;
        line-height: 1.9rem;
    }

    #vcomments h3 {
        font-size: 1.45rem;
        font-weight: bold;
        line-height: 1.7rem;
    }

    #vcomments h4 {
        font-size: 1.25rem;
        font-weight: bold;
        line-height: 1.5rem;
    }

    #vcomments h5 {
        font-size: 1.1rem;
        font-weight: bold;
        line-height: 1.4rem;
    }

    #vcomments h6 {
        font-size: 1rem;
        line-height: 1.3rem;
    }

    /* ---- Rules and quotes ---- */
    #vcomments hr {
        margin: 12px 0;
        border: 0;
        border-top: 1px solid #ccc;
    }

    #vcomments blockquote {
        margin: 15px 0;
        border-left: 5px solid #42b983;
        padding: 1rem 0.8rem 0.3rem 0.8rem;
        color: #666;
        background-color: rgba(66, 185, 131, .1);
    }

    /* ---- Code ---- */
    #vcomments pre {
        font-family: monospace, monospace;
        padding: 1.2em;
        margin: .5em 0;
        background: #272822;
        overflow: auto;
        border-radius: 0.3em;
        tab-size: 4;
    }

    #vcomments code {
        font-family: monospace, monospace;
        padding: 1px 3px;
        font-size: 0.92rem;
        color: #e96900;
        background-color: #f8f8f8;
        border-radius: 2px;
    }

    #vcomments pre code {
        font-family: monospace, monospace;
        padding: 0;
        color: #e8eaf6;
        background-color: #272822;
    }

    #vcomments pre[class*="language-"] {
        padding: 1.2em;
        margin: .5em 0;
    }

    /* NOTE(review): the second selector is page-wide, not scoped to #vcomments. */
    #vcomments code[class*="language-"],
    pre[class*="language-"] {
        color: #e8eaf6;
    }

    /* ---- Inline text semantics ---- */
    /* NOTE(review): "strong" here is page-wide, not scoped to #vcomments. */
    #vcomments b,
    strong {
        font-weight: bold;
    }

    #vcomments dfn {
        font-style: italic;
    }

    #vcomments small {
        font-size: 85%;
    }

    #vcomments cite {
        font-style: normal;
    }

    #vcomments mark {
        background-color: #fcf8e3;
        padding: .2em;
    }

    /* ---- Tables ---- */
    /* Earlier copies of this rule (one of them setting "border: 0") were
       overridden by this one, so only the winning declarations remain.
       NOTE(review): "th, td" are page-wide, not scoped to #vcomments. */
    #vcomments table, th, td {
        padding: 12px 13px;
        border: 1px solid #dfe2e5;
    }

    /* NOTE(review): both selectors are page-wide (no #vcomments prefix). */
    table tr:nth-child(2n), thead {
        background-color: #fafafa;
    }

    #vcomments table th {
        background-color: #f2f2f2;
        min-width: 80px;
    }

    #vcomments table td {
        min-width: 80px;
    }

    /* ---- Checkboxes ---- */
    /* NOTE(review): the ":checked" selector is page-wide, not scoped to #vcomments. */
    #vcomments [type="checkbox"]:not(:checked), [type="checkbox"]:checked {
        position: inherit;
        margin-left: -1.3rem;
        margin-right: 0.4rem;
        margin-top: -1px;
        vertical-align: middle;
        left: unset;
        visibility: visible;
    }
</style>

<!-- Comment card: a heading row plus the empty #vcomments container that the Valine
     initialisation script targets (el: '#vcomments'). -->
<div class="card valine-card" data-aos="fade-up">
    <div class="comment_headling" style="font-size: 20px; font-weight: 700; position: relative; padding-left: 20px; top: 15px; padding-bottom: 5px;">
        <i class="fas fa-comments fa-fw" aria-hidden="true"></i>
        <span>评论</span>
    </div>
    <!-- Left empty in the markup; Valine is mounted on this element. -->
    <div id="vcomments" class="card-content" style="display: grid">
    </div>
</div>

<script src="/libs/valine/av-min.js"></script>
<script src="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/libs/valine/Valine.min.js"></script>
<script>
    // Mount the Valine comment widget on #vcomments.
    // The string comparisons are what the site generator emitted for the boolean
    // settings; they evaluate to false / false / true respectively.
    (function () {
        var valineOptions = {
            el: '#vcomments',
            appId: 'RPCuj0HNm1eqAREO6c5T7nSJ-gzGzoHsz',
            appKey: 'laCdQbWLFWOWdkXVM3RxoXGe',
            notify: 'false' === 'true',
            verify: 'false' === 'true',
            visitor: 'true' === 'true',
            avatar: 'mm',
            pageSize: '10',
            lang: 'zh-cn',
            placeholder: '快来留言吧'
        };

        new Valine(valineOptions);
    }());
</script>

    

    

    

<!-- Adjacent-post navigation: two cards side by side (one column each from the "m" breakpoint up).
     Each card shows a cover image, title, summary, publish date, category and tags. -->
<article id="prenext-posts" class="prev-next articles">
    <div class="row article-row">
        
        <!-- Card badged "上一篇" (previous post). -->
        <div class="article col s12 m6" data-aos="fade-up">
            <div class="article-badge left-badge text-color">
                <i class="fas fa-chevron-left"></i>&nbsp;上一篇</div>
            <div class="card">
                <a href="/2020/06/30/scikit-learn-xi-lie-san-xian-xing-hui-gui/">
                    <div class="card-image">
                        
                        
                        <img src="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/medias/featureimages/13.jpg" class="responsive-img" alt="scikit-learn系列三：线性回归">
                        
                        <span class="card-title">scikit-learn系列三：线性回归</span>
                    </div>
                </a>
                <div class="card-content article-content">
                    <div class="summary block-with-text">
                        
                            实现线性回归算法，原理解释+房价预测案例
                        
                    </div>
                    <div class="publish-info">
                        <span class="publish-date">
                            <i class="far fa-clock fa-fw icon-date"></i>2020-06-30
                        </span>
                        <span class="publish-author">
                            
                            <i class="fas fa-bookmark fa-fw icon-category"></i>
                            
                            <a href="/categories/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/" class="post-category">
                                    机器学习
                                </a>
                            
                            
                        </span>
                    </div>
                </div>
                
                <!-- Tag chips; each links to the tag's listing page. -->
                <div class="card-action article-tags">
                    
                    <a href="/tags/%E5%9F%BA%E7%A1%80%E7%9F%A5%E8%AF%86/">
                        <span class="chip bg-color">基础知识</span>
                    </a>
                    
                    <a href="/tags/ML%E7%AE%97%E6%B3%95/">
                        <span class="chip bg-color">ML算法</span>
                    </a>
                    
                </div>
                
            </div>
        </div>
        
        
        <!-- Card badged "下一篇" (next post). -->
        <div class="article col s12 m6" data-aos="fade-up">
            <div class="article-badge right-badge text-color">
                下一篇&nbsp;<i class="fas fa-chevron-right"></i>
            </div>
            <div class="card">
                <a href="/2020/06/28/scikit-learn-xi-lie-yi-ji-qi-xue-xi-ji-chu/">
                    <div class="card-image">
                        
                        
                        <img src="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/medias/featureimages/0.jpg" class="responsive-img" alt="scikit-learn系列一：机器学习基础">
                        
                        <span class="card-title">scikit-learn系列一：机器学习基础</span>
                    </div>
                </a>
                <div class="card-content article-content">
                    <div class="summary block-with-text">
                        
                            介绍机器学习的基本概念与分类等
                        
                    </div>
                    <div class="publish-info">
                            <span class="publish-date">
                                <i class="far fa-clock fa-fw icon-date"></i>2020-06-28
                            </span>
                        <span class="publish-author">
                            
                            <i class="fas fa-bookmark fa-fw icon-category"></i>
                            
                            <a href="/categories/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/" class="post-category">
                                    机器学习
                                </a>
                            
                            
                        </span>
                    </div>
                </div>
                
                <!-- Tag chips; each links to the tag's listing page. -->
                <div class="card-action article-tags">
                    
                    <a href="/tags/%E5%9F%BA%E7%A1%80%E7%9F%A5%E8%AF%86/">
                        <span class="chip bg-color">基础知识</span>
                    </a>
                    
                </div>
                
            </div>
        </div>
        
    </div>
</article>

</div>



<!-- Code-block feature dependencies -->
<script src="/libs/codeBlock/codeBlockFuction.js"></script>

<!-- Code-block language -->
<script src="/libs/codeBlock/codeLang.js"></script>

<!-- Code-block copy -->
<script src="/libs/codeBlock/codeCopy.js"></script>

<!-- Code-block shrink -->
<script src="/libs/codeBlock/codeShrink.js"></script>

<!-- Code-block line wrapping: force white-space: pre on highlighted code -->
<style>
code[class*="language-"],
pre[class*="language-"] {
    white-space: pre !important;
}
</style>


    </div>
    <!-- Table-of-contents sidebar. It starts with the "expanded" class, which the
         floating TOC button's click handler toggles. #toc-content is the empty
         container passed to tocbot as tocSelector. -->
    <div id="toc-aside" class="expanded col l3 hide-on-med-and-down">
        <div class="toc-widget">
            <div class="toc-title"><i class="far fa-list-alt"></i>&nbsp;&nbsp;目录</div>
            <div id="toc-content"></div>
        </div>
    </div>
</div>

<!-- Floating TOC toggle button. The click handler is attached by script via
     '#floating-toc-btn .btn-floating'.
     NOTE(review): the <a> has no href, so it is not reachable by keyboard by default —
     consider a <button> once the theme CSS is confirmed to style it the same way. -->

<div id="floating-toc-btn" class="hide-on-med-and-down">
    <a class="btn-floating btn-large bg-color">
        <i class="fas fa-list-ul"></i>
    </a>
</div>


<script src="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/libs/tocbot/tocbot.min.js"></script>
<script>
    // On DOM ready: build the table of contents, pin it while scrolling, and wire the
    // floating button that shows/hides the TOC sidebar.
    $(function () {
        tocbot.init({
            tocSelector: '#toc-content',
            contentSelector: '#articleContent',
            headingsOffset: -($(window).height() * 0.4 - 45),
            collapseDepth: Number('0'),
            headingSelector: 'h2, h3, h4'
        });

        // Rewrite TOC links and heading ids to sequential ASCII anchors
        // ("toc-heading-1", "toc-heading-2", ...) so they work with Chinese titles.
        const tocHeading = 'toc-heading-';
        $('#toc-content a').each(function (index) {
            $(this).attr('href', '#' + tocHeading + (index + 1));
        });

        // find() walks every descendant heading in document order. children() only saw
        // direct children, so a heading nested in a wrapper (blockquote, list item, div)
        // got a TOC link but no id, and every later link pointed at the wrong heading.
        $('#articleContent').find('h2, h3, h4').each(function (index) {
            $(this).attr('id', tocHeading + (index + 1));
        });

        // Pin the TOC widget once the page has scrolled past this offset.
        const tocHeight = Math.trunc($(window).height() * 0.4 - 64);
        const $tocWidget = $('.toc-widget');
        $(window).scroll(function () {
            const scroll = $(window).scrollTop();
            /* add post toc fixed. */
            if (scroll > tocHeight) {
                $tocWidget.addClass('toc-fixed');
            } else {
                $tocWidget.removeClass('toc-fixed');
            }
        });

        /**
         * Fix the width of the post card div: copy the width of #srcId onto #targetId,
         * plus a width-dependent padding allowance. Does nothing if #srcId is absent.
         */
        const fixPostCardWidth = function (srcId, targetId) {
            const srcDiv = $('#' + srcId);
            if (srcDiv.length === 0) {
                return;
            }

            let w = srcDiv.width();
            // Branches are tested from widest to narrowest, so each one only needs
            // its lower bound.
            if (w >= 450) {
                w = w + 21;
            } else if (w >= 350) {
                w = w + 18;
            } else if (w >= 300) {
                w = w + 16;
            } else {
                w = w + 14;
            }
            $('#' + targetId).width(w);
        };

        // Toggle the TOC sidebar between expanded and collapsed; the main column gets
        // the 'l9' class only while the sidebar is visible.
        const expandedClass = 'expanded';
        const $tocAside = $('#toc-aside');
        const $mainContent = $('#main-content');
        $('#floating-toc-btn .btn-floating').click(function () {
            if ($tocAside.hasClass(expandedClass)) {
                $tocAside.removeClass(expandedClass).hide();
                $mainContent.removeClass('l9');
            } else {
                $tocAside.addClass(expandedClass).show();
                $mainContent.addClass('l9');
            }
            fixPostCardWidth('artDetail', 'prenext-posts');
        });

    });
</script>

    

</main>


<script src="https://cdn.bootcss.com/mathjax/2.7.5/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script>
    // Inline-math delimiters: $...$ and \(...\).
    // The backslashes must be doubled inside a JavaScript string literal: '\(' evaluates
    // to plain '(' and would register ordinary parentheses as math delimiters, so any
    // parenthesised text in a post would be typeset as math.
    MathJax.Hub.Config({
        tex2jax: {inlineMath: [['$', '$'], ['\\(', '\\)']]}
    });
</script>



<!-- Page footer: copyright line, site statistics and the running-time counter. -->
<footer class="page-footer bg-color">
    
    <div class="container row center-align" style="margin-bottom: 15px !important;">
        <div class="col s12 m8 l8 copy-right">
            Copyright&nbsp;&copy;
            <!-- #year is overwritten by siteTime() with the current year or a "start - current" range. -->
            <span id="year">2020</span>
            <a href="/about" target="_blank">DongZhou</a>
            |&nbsp;Powered by&nbsp;<a href="https://hexo.io/" target="_blank">Hexo</a>
            |&nbsp;Theme&nbsp;<a href="https://github.com/blinkfox/hexo-theme-matery" target="_blank">Matery</a>
            <br>
            
            &nbsp;<i class="fas fa-chart-area"></i>&nbsp;站点总字数:&nbsp;<span
                class="white-color">60.5k</span>&nbsp;字
            
            
            
            
            
            
            <!-- Visit counters: the value spans are empty in the markup; presumably filled
                 in by the busuanzi script loaded near the end of the page — verify. -->
            <span id="busuanzi_container_site_pv">
                |&nbsp;<i class="far fa-eye"></i>&nbsp;总访问量:&nbsp;<span id="busuanzi_value_site_pv"
                    class="white-color"></span>&nbsp;次
            </span>
            
            
            <span id="busuanzi_container_site_uv">
                |&nbsp;<i class="fas fa-users"></i>&nbsp;总访问人数:&nbsp;<span id="busuanzi_value_site_uv"
                    class="white-color"></span>&nbsp;人
            </span>
            
            <br>
            
            <!-- Placeholder text; siteTime() replaces it with the running-time string. -->
            <span id="sitetime">载入运行时间...</span>
            <script>
                /**
                 * Refresh the footer's copyright year (#year) and running-time text (#sitetime).
                 *
                 * The elapsed time is the difference between the site start date
                 * (2020-06-27 00:00:00) and the visitor's current wall-clock time, split
                 * into years of 365 days, days, hours, minutes and seconds.
                 * Takes no arguments and returns nothing; it only writes to the two elements.
                 */
                function siteTime() {
                    var seconds = 1000;
                    var minutes = seconds * 60;
                    var hours = minutes * 60;
                    var days = hours * 24;
                    var years = days * 365;
                    var today = new Date();
                    // Start date as calendar values (month is 1-12).
                    var startYear = "2020";
                    var startMonth = "6";
                    var startDate = "27";
                    var startHour = "0";
                    var startMinute = "0";
                    var startSecond = "0";
                    var todayYear = today.getFullYear();
                    var todayMonth = today.getMonth() + 1;
                    var todayDate = today.getDate();
                    var todayHour = today.getHours();
                    var todayMinute = today.getMinutes();
                    var todaySecond = today.getSeconds();
                    // Date.UTC() takes a zero-based month (0 = January). Both months above are
                    // calendar months, so subtract 1; passing them unshifted moved both dates
                    // one month forward and miscounted days whenever the shifted months
                    // differed in length.
                    var t1 = Date.UTC(startYear, startMonth - 1, startDate, startHour, startMinute, startSecond);
                    var t2 = Date.UTC(todayYear, todayMonth - 1, todayDate, todayHour, todayMinute, todaySecond);
                    var diff = t2 - t1;
                    var diffYears = Math.floor(diff / years);
                    var diffDays = Math.floor((diff / days) - diffYears * 365);
                    var diffHours = Math.floor((diff - (diffYears * 365 + diffDays) * days) / hours);
                    var diffMinutes = Math.floor((diff - (diffYears * 365 + diffDays) * days - diffHours * hours) /
                        minutes);
                    var diffSeconds = Math.floor((diff - (diffYears * 365 + diffDays) * days - diffHours * hours -
                        diffMinutes * minutes) / seconds);
                    // Loose equality on purpose: startYear is a string, todayYear a number.
                    if (startYear == todayYear) {
                        // Still in the start year: show a single year and omit the "years" part.
                        document.getElementById("year").innerHTML = todayYear;
                        document.getElementById("sitetime").innerHTML = "本站已安全运行 " + diffDays + " 天 " + diffHours +
                            " 小时 " + diffMinutes + " 分钟 " + diffSeconds + " 秒";
                    } else {
                        document.getElementById("year").innerHTML = startYear + " - " + todayYear;
                        document.getElementById("sitetime").innerHTML = "本站已安全运行 " + diffYears + " 年 " + diffDays +
                            " 天 " + diffHours + " 小时 " + diffMinutes + " 分钟 " + diffSeconds + " 秒";
                    }
                }
                setInterval(siteTime, 1000);
            </script>
            
            <br>
            
        </div>
        <!-- Social / contact links. Each link is icon-only, so it carries an aria-label
             (same text as its tooltip) and the icon is hidden from assistive technology. -->
        <div class="col s12 m4 l4 social-link social-statis">
            <!-- External site opened in a new tab: rel keeps it from reaching window.opener. -->
            <a href="https://github.com/DongZhouGu" class="tooltipped" target="_blank" rel="noopener noreferrer" aria-label="访问我的GitHub" data-tooltip="访问我的GitHub" data-position="top" data-delay="50">
                <i class="fab fa-github" aria-hidden="true"></i>
            </a>

            <a href="mailto:gdz678@163.com" class="tooltipped" target="_blank" aria-label="邮件联系我" data-tooltip="邮件联系我" data-position="top" data-delay="50">
                <i class="fas fa-envelope-open" aria-hidden="true"></i>
            </a>

            <a href="tencent://AddContact/?fromId=50&fromSubId=1&subcmd=all&uin=1596586942" class="tooltipped" target="_blank" aria-label="QQ联系我: 1596586942" data-tooltip="QQ联系我: 1596586942" data-position="top" data-delay="50">
                <i class="fab fa-qq" aria-hidden="true"></i>
            </a>

            <a href="/atom.xml" class="tooltipped" target="_blank" aria-label="RSS 订阅" data-tooltip="RSS 订阅" data-position="top" data-delay="50">
                <i class="fas fa-rss" aria-hidden="true"></i>
            </a>
        </div>
    </div>
</footer>

<div class="progress-bar"></div>


<!-- Search modal overlay. #searchInput and #searchResult are the ids handed to
     searchFunc() by the search initialisation script. -->
<div id="searchModal" class="modal">
    <div class="modal-content">
        <div class="search-header">
            <span class="title"><i class="fas fa-search" aria-hidden="true"></i>&nbsp;&nbsp;搜索</span>
            <!-- A placeholder is not a label; aria-label gives the field an accessible name. -->
            <input type="search" id="searchInput" name="s" placeholder="请输入搜索的关键字"
                   class="search-input" aria-label="搜索">
        </div>
        <div id="searchResult"></div>
    </div>
</div>

<script src="/js/search.js"></script>
<script>
// Once the DOM is ready, hand the search index URL and the ids of the search
// input and result container to searchFunc().
$(document).ready(function () {
    searchFunc("/search.xml", 'searchInput', 'searchResult');
});
</script>

<!-- Back-to-top button.
     NOTE(review): the link target is the placeholder "#!"; the scrolling behaviour is
     presumably attached by the theme script — verify. -->
<div id="backTop" class="top-scroll">
    <a class="btn-floating btn-large waves-effect waves-light" href="#!">
        <i class="fas fa-arrow-up"></i>
    </a>
</div>


<!-- Third-party libraries, then the theme's own script (order preserved).
     The src values no longer carry the stray leading space; browsers strip it anyway,
     so the same URLs are requested. -->
<script src="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/libs/materialize/materialize.min.js"></script>
<script src="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/libs/masonry/masonry.pkgd.min.js"></script>
<script src="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/libs/aos/aos.js"></script>
<script src="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/libs/scrollprogress/scrollProgress.min.js"></script>
<script src="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/libs/lightGallery/js/lightgallery-all.min.js"></script>
<script src="/js/matery.js"></script>

<!-- Baidu Analytics -->

<!-- Baidu Push -->

<script>
    // Insert Baidu's push script in front of the first <script> element of the
    // document, choosing the URL that matches the page's protocol.
    (function () {
        var isHttps = window.location.protocol.split(':')[0] === 'https';
        var pushScript = document.createElement('script');
        pushScript.src = isHttps
            ? 'https://zz.bdstatic.com/linksubmit/push.js'
            : 'http://push.zhanzhang.baidu.com/push.js';

        var firstScript = document.getElementsByTagName("script")[0];
        firstScript.parentNode.insertBefore(pushScript, firstScript);
    })();
</script>


<!-- Optional add-ons: two async scripts and one ES module. -->
<script async src="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/libs/others/clicklove.js"></script>

<script async src="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/libs/others/busuanzi.pure.mini.js"></script>

<script type="module" src="https://cdn.jsdelivr.net/gh/DongZhouGu/DongZhouGu.github.io/libs/instantpage/instantpage.js"></script>


<!-- Image lazy-loading: three inline scripts (settings, link fix-up, loader). -->
<script>
            // Shared settings object. processImages starts as null and is set to the
            // loader function by the third script; isSPA makes the loader re-query the
            // image list on every pass.
            window.imageLazyLoadSetting = {
                isSPA: false,
                processImages: null,
            };
        </script>
<!-- On window load: for each img[data-original] whose parent is an <a> with an href that
     ends in an image extension or starts as a base64 image data URI, point that href at
     the image's data-original URL. -->
<script>window.addEventListener("load",function(){var t=/\.(gif|jpg|jpeg|tiff|png)$/i,r=/^data:image\/[a-z]+;base64,/;Array.prototype.slice.call(document.querySelectorAll("img[data-original]")).forEach(function(a){var e=a.parentNode;"A"===e.tagName&&(e.href.match(t)||e.href.match(r))&&(e.href=a.dataset.original)})});</script>
<!-- Loader: for every pending img[data-original] inside the viewport, preload the
     data-original URL with a detached Image and, once it has loaded, copy it to src and
     drop the image from the pending list. Runs once immediately and again 500 ms after
     scrolling stops (each scroll event restarts the timer). -->
<script>!function(n){n.imageLazyLoadSetting.processImages=o;var i=n.imageLazyLoadSetting.isSPA,r=Array.prototype.slice.call(document.querySelectorAll("img[data-original]"));function o(){i&&(r=Array.prototype.slice.call(document.querySelectorAll("img[data-original]")));for(var t,e,a=0;a<r.length;a++)t=r[a],e=void 0,0<=(e=t.getBoundingClientRect()).bottom&&0<=e.left&&e.top<=(n.innerHeight||document.documentElement.clientHeight)&&function(){var t,e,n,i,o=r[a];t=o,e=function(){r=r.filter(function(t){return o!==t})},n=new Image,i=t.getAttribute("data-original"),n.onload=function(){t.src=i,e&&e()},n.src=i}()}o(),n.addEventListener("scroll",function(){var t,e;t=o,e=n,clearTimeout(t.tId),t.tId=setTimeout(function(){t.call(e)},500)})}(this);</script></body>

</html>
